# Model selection

The most informative features in this dataset are numeric, and many of them exhibit non-linear relationships. SHAP feature importance confirmed that the model valued the squared features, suggesting that a tree-based model would be well-suited to capturing these patterns without requiring explicit feature engineering.

Models to test:
1. Random Forest
2. XGBoost
3. LightGBM

Given the small size of the dataset, I expect Random Forest to perform the best, as it typically generalizes better on limited data compared to boosting methods, which may overfit more easily under such constraints.

In [20]:
import pandas as pd
import numpy as np
import warnings
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import (
    make_scorer,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils.validation import check_is_fitted

In [21]:
# Fetch the heart-failure CSV from GitHub and discard exact duplicate rows.
url = "https://raw.githubusercontent.com/CarsonShively/Heart-Failure/refs/heads/main/data/heart_failure.csv"
df = pd.read_csv(url)
df = df.drop_duplicates()

# Split the frame into the target column and the remaining predictors.
y = df['DEATH_EVENT']
X = df.drop(columns='DEATH_EVENT')

# Hold out 20% of the rows; stratifying on y keeps the class ratio the same
# in both parts, and the fixed seed makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Custom Classes

In [22]:
class ConvertToInt64(BaseEstimator, TransformerMixin):
    """Coerce binary-flag columns to pandas' nullable ``Int64`` dtype.

    Every value is parsed as a number. Anything that is not exactly 0 or 1
    (unparseable text, fractions, other integers, missing values) becomes
    ``pd.NA`` so that a later step can impute it.
    """

    def fit(self, X, y=None):
        """Record the input column labels; nothing else is learned.

        Returns
        -------
        self
        """
        X = pd.DataFrame(X)
        self.feature_names_in_ = X.columns.tolist()
        return self

    def set_output(self, *, transform=None):
        """Accept a ``set_output`` request without changing anything.

        ``transform`` already returns a DataFrame, so the call is a no-op
        that simply returns ``self``.
        """
        return self

    def transform(self, X):
        """Return a copy of ``X`` with every column cast to ``Int64``.

        Parameters
        ----------
        X : DataFrame or array-like
            Data with the same columns that were seen in ``fit``.

        Returns
        -------
        pandas.DataFrame
            Same index and column labels as the input; values are 0, 1 or
            ``pd.NA``.
        """
        check_is_fitted(self)
        X = pd.DataFrame(X, columns=self.feature_names_in_).copy()

        for col in X.columns:
            numeric = pd.to_numeric(X[col], errors='coerce')
            # Blank out everything except 0/1 *before* the integer cast:
            # casting a float column that still holds a fraction such as 0.5
            # to Int64 raises instead of producing a missing value.
            X[col] = numeric.where(numeric.isin([0, 1])).astype("Int64")

        return X


class ConvertToint64(BaseEstimator, TransformerMixin):
    """Cast every column to NumPy's fixed-width ``int64`` dtype.

    The input must not contain missing values, because the cast raises when
    it meets NaN. Place this step after an imputer.
    """

    def fit(self, X, y=None):
        """Record the input column labels; nothing else is learned.

        Returns
        -------
        self
        """
        X = pd.DataFrame(X)
        self.feature_names_in_ = X.columns.tolist()
        return self

    def set_output(self, *, transform=None):
        """Accept a ``set_output`` request without changing anything.

        ``transform`` already returns a DataFrame, so the call is a no-op
        that simply returns ``self``.
        """
        return self

    def transform(self, X):
        """Return a new DataFrame with all columns cast to ``int64``.

        Parameters
        ----------
        X : DataFrame or array-like
            Data with the same columns that were seen in ``fit``.

        Returns
        -------
        pandas.DataFrame
            Same index and column labels as the input, dtype ``int64``.
        """
        check_is_fitted(self)
        X = pd.DataFrame(X, columns=self.feature_names_in_)
        # Spell the width out: plain ``int`` maps to a 32-bit integer on some
        # platforms, which contradicts what this transformer promises.
        # ``astype`` returns a new frame, so the caller's data is not mutated.
        return X.astype("int64")

class CoerceToFloat(BaseEstimator, TransformerMixin):
    """Cast every input column to ``float``."""

    def fit(self, X, y=None):
        """Remember the column labels of ``X``; no statistics are learned."""
        self.feature_names_in_ = pd.DataFrame(X).columns.tolist()
        return self

    def set_output(self, *, transform=None):
        """No-op hook: the output container is not configurable here."""
        return self

    def transform(self, X):
        """Return a float-typed DataFrame carrying the fitted column labels."""
        check_is_fitted(self)
        frame = pd.DataFrame(X, columns=self.feature_names_in_)
        # A frame-wide cast yields a new object, leaving the input untouched.
        as_float = frame.astype(float)
        return pd.DataFrame(
            as_float, columns=self.feature_names_in_, index=as_float.index
        )

class OutlierClipper(BaseEstimator, TransformerMixin):
    """Clip columns to quantile bounds learned during ``fit``.

    Parameters
    ----------
    columns : list or None, default=None
        Labels of the columns to clip. ``None`` clips every input column.
    lower_quantile : float, default=0.01
        Quantile used as the lower clipping bound.
    upper_quantile : float, default=0.99
        Quantile used as the upper clipping bound.
    """

    def __init__(self, columns=None, lower_quantile=0.01, upper_quantile=0.99):
        self.columns = columns
        self.lower_quantile = lower_quantile
        self.upper_quantile = upper_quantile

    def fit(self, X, y=None):
        """Compute the per-column clipping bounds from ``X``.

        Raises
        ------
        ValueError
            If the quantiles are not ordered within [0, 1], or if ``columns``
            names a label that is absent from ``X``.
        """
        # Validate here rather than in __init__ so the constructor only
        # stores its arguments.
        if not 0 <= self.lower_quantile <= self.upper_quantile <= 1:
            raise ValueError(
                "Expected 0 <= lower_quantile <= upper_quantile <= 1, got "
                f"{self.lower_quantile} and {self.upper_quantile}."
            )

        X = pd.DataFrame(X)
        self.feature_names_in_ = X.columns.tolist()

        # Honour the ``columns`` selection; fall back to every column.
        if self.columns is None:
            targets = self.feature_names_in_
        else:
            targets = list(self.columns)
            unknown = [col for col in targets if col not in X.columns]
            if unknown:
                raise ValueError(f"Columns not found in input: {unknown}")

        self.lower_bounds_ = {
            col: X[col].quantile(self.lower_quantile) for col in targets
        }
        self.upper_bounds_ = {
            col: X[col].quantile(self.upper_quantile) for col in targets
        }
        return self

    def set_output(self, *, transform=None):
        """Accept a ``set_output`` request without changing anything.

        ``transform`` already returns a DataFrame, so the call is a no-op
        that simply returns ``self``.
        """
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns clipped.

        Parameters
        ----------
        X : DataFrame or array-like
            Data with the same columns that were seen in ``fit``.

        Returns
        -------
        pandas.DataFrame
            Same index as the input, columns labelled as in ``fit``.
        """
        check_is_fitted(self)
        if isinstance(X, pd.DataFrame):
            X = X.copy()
        else:
            # Array input has no labels; reuse the fitted ones so the bounds
            # lookup below finds its keys.
            X = pd.DataFrame(X, columns=self.feature_names_in_).copy()

        for col, lower in self.lower_bounds_.items():
            X[col] = X[col].clip(lower, self.upper_bounds_[col])

        return pd.DataFrame(X, columns=self.feature_names_in_, index=X.index)

# Binary features for tree-based models
Binary features were already mapped to 0 and 1, so the only preprocessing step required was:

NaN Handling: Missing values were imputed with safe defaults (e.g., 0 or mode where appropriate).

This minimal preprocessing ensures clarity and consistency across tree-based model comparisons.

In [23]:
def _flag_pipeline(imputer):
    """Build the cast -> impute -> cast pipeline shared by the 0/1 columns.

    ``imputer`` is the only step that differs between the pipelines below.
    Each call creates fresh converter instances so no step is shared.
    """
    return Pipeline(steps=[
        ('Int', ConvertToInt64()),
        ('impute', imputer),
        ('int', ConvertToint64()),
    ])


# Generic binary flags: missing values are filled with the constant 0.
binary_pipeline = _flag_pipeline(
    SimpleImputer(strategy='constant', fill_value=0)
)

# Sex: missing values are filled with the most frequent value.
sex_pipeline = _flag_pipeline(SimpleImputer(strategy='most_frequent'))

# Numeric features for tree-based models
To ensure a simple and consistent preprocessing pipeline for model comparison, I limited transformations to:

1. NaN Handling: Missing values were imputed using the median.

2. Outlier Treatment: Extreme values were clipped at the 1st and 99th percentiles.

This avoids overfitting through complex transformations and keeps the pipeline fair across all models.

In [18]:
# Numeric columns: cast to float, fill gaps with the median, then clip
# outliers using OutlierClipper's default quantile bounds.
numeric_steps = [
    ('float', CoerceToFloat()),
    ('impute', SimpleImputer(strategy='median')),
    ('clip', OutlierClipper()),
]
numeric_pipeline = Pipeline(steps=numeric_steps)

# Pipeline

In [24]:
# Columns routed through the generic binary-flag pipeline.
binary_cols = ['anaemia', 'diabetes', 'high_blood_pressure', 'smoking']

# Columns routed through the numeric pipeline.
numeric_features = [
    'age', 'creatinine_phosphokinase', 'ejection_fraction', 'platelets',
    'serum_creatinine', 'serum_sodium', 'time',
]

# One sub-pipeline per column group; 'sex' gets its own pipeline.
column_routes = [
    ('bin', binary_pipeline, binary_cols),
    ('sex', sex_pipeline, ['sex']),
    ('num', numeric_pipeline, numeric_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

# Ask for DataFrame output instead of a bare array.
preprocessor.set_output(transform='pandas')

# Five shuffled, stratified folds with a fixed seed for reproducibility.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 'balanced' weights are inversely proportional to class frequency, so the
# ratio weights[1] / weights[0] is the negative-to-positive count ratio that
# XGBoost's scale_pos_weight expects.
weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
pos_weight = weights[1] / weights[0]

# Candidate models, each configured to compensate for class imbalance.
tree_models = {
    "Random Forest": RandomForestClassifier(
        n_estimators=100,
        class_weight='balanced',
        random_state=42,
    ),
    "XGBoost": XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        scale_pos_weight=pos_weight,
        verbosity=0,
        random_state=42,
    ),
    "LightGBM": LGBMClassifier(
        class_weight='balanced',
        random_state=42,
        verbose=-1,
    ),
}

# Each label maps to the scikit-learn scorer of the same name.
scoring = {
    metric: metric
    for metric in ('accuracy', 'precision', 'recall', 'f1', 'roc_auc')
}

# Cross-validate each candidate behind the shared preprocessor and collect
# one summary row per model.
results = []

for name, model in tree_models.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model),
    ])
    pipeline.set_output(transform='pandas')

    scores = cross_validate(
        pipeline, X_train, y_train, scoring=scoring, cv=cv
    )

    # Column order: model name, then every mean, then every std.
    summary = {'Model': name}
    summary.update(
        (f"{metric} Mean", scores[f"test_{metric}"].mean()) for metric in scoring
    )
    summary.update(
        (f"{metric} Std", scores[f"test_{metric}"].std()) for metric in scoring
    )
    results.append(summary)

results_df = pd.DataFrame(results)
print(results_df)

           Model  accuracy Mean  precision Mean  recall Mean   f1 Mean  \
0  Random Forest       0.849645        0.843636     0.675000  0.745924   
1        XGBoost       0.841312        0.781822     0.715000  0.745710   
2       LightGBM       0.832890        0.785761     0.701667  0.732190   

   roc_auc Mean  accuracy Std  precision Std  recall Std    f1 Std  \
0      0.905236      0.047745       0.136405    0.043381  0.071081   
1      0.905821      0.046204       0.096192    0.047770  0.066913   
2      0.902520      0.052184       0.146104    0.088530  0.073878   

   roc_auc Std  
0     0.025187  
1     0.027031  
2     0.027179  


# Summary

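As I expected, Random Forest was the strongest model overall on this small dataset, leading on accuracy (0.850) and precision (0.844), although the margins are small relative to the fold-to-fold standard deviations. XGBoost was very competitive: it matched Random Forest on F1 (0.746 for both) and ROC AUC (0.906 vs. 0.905) and achieved the highest recall (0.715 vs. 0.675 for Random Forest). While LightGBM typically excels with larger data, it underperformed here, finishing last on accuracy, F1, and ROC AUC. Given its lead in accuracy and precision, Random Forest is my choice for the final model; note, however, that it had the lowest recall of the three, so if catching as many death events as possible is the priority, XGBoost deserves a closer look.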