# Baseline model

The data had multiple linear features, making logistic regression a good baseline for interpretability.

In [1]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn.pipeline")
import pandas as pd
import numpy as np
import shap
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    make_scorer,
    confusion_matrix
)
from sklearn.utils.validation import check_is_fitted

In [2]:
url = "https://raw.githubusercontent.com/CarsonShively/Heart-Failure/refs/heads/main/data/heart_failure.csv"

# Read the CSV and discard exact duplicate rows in one expression,
# so re-running the cell always starts from the raw file.
df = pd.read_csv(url).drop_duplicates()

# Target column vs. everything else.
y = df['DEATH_EVENT']
X = df.drop(columns='DEATH_EVENT')

# 80/20 split, stratified on the target and seeded so the split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Custom classes
1. Binary to Int64
2. Back to int64
3. To float64
4. Clip outliers

In [3]:
class ConvertToInt64(BaseEstimator, TransformerMixin):
    """Coerce flag columns to nullable ``Int64`` containing only 0, 1 or ``pd.NA``.

    Every value that is not numerically 0 or 1 (unparseable text, missing
    values, any other number) is replaced by ``pd.NA`` so that a later
    imputation step can fill it. Both ``fit`` and ``transform`` expect a
    pandas DataFrame (they read ``X.columns``).
    """

    def set_output(self, *, transform=None):
        """Accept and ignore the output-container setting; returns ``self``."""
        return self

    def fit(self, X, y=None):
        """Record the input column names; nothing is learned from the data."""
        self.feature_names_in_ = X.columns.tolist()
        return self

    def transform(self, X):
        """Return a copy of ``X`` with every column cast to masked ``Int64``.

        Parameters
        ----------
        X : pandas.DataFrame
            Columns to clean. The input frame is not modified.

        Returns
        -------
        pandas.DataFrame
            Same columns and index; each column has dtype ``Int64`` and holds
            only 0, 1 or ``pd.NA``.
        """
        check_is_fitted(self)
        X = X.copy()

        for col in X.columns:
            numeric = pd.to_numeric(X[col], errors='coerce')
            # Mask invalid values BEFORE the integer cast. Casting first makes
            # a non-integral or infinite value (e.g. 0.5) raise instead of
            # turning into a missing value for the imputer.
            X[col] = numeric.where(numeric.isin([0, 1])).astype('Int64')

        return X

class ConvertToint64(BaseEstimator, TransformerMixin):
    """Cast every column of a DataFrame to the plain (non-nullable) ``int`` dtype."""

    def fit(self, X, y=None):
        """Remember the incoming column names; no statistics are computed."""
        self.feature_names_in_ = X.columns.tolist()
        return self

    def set_output(self, *, transform=None):
        """Accept and ignore the output-container setting; returns ``self``."""
        return self

    def transform(self, X):
        """Return a new DataFrame in which each column has been cast with ``astype(int)``."""
        check_is_fitted(self)
        # One mapped astype call yields a fresh frame, leaving the input untouched.
        return X.astype({name: int for name in X.columns})

class CoerceToFloat(BaseEstimator, TransformerMixin):
    """Coerce every column of a DataFrame to ``float``, mapping bad values to NaN.

    Values that cannot be parsed as numbers become ``NaN`` instead of raising,
    so a later imputation step sees a single missing-value marker.
    Both ``fit`` and ``transform`` expect a pandas DataFrame.
    """

    def fit(self, X, y=None):
        """Record the input column names; nothing is learned from the data."""
        self.feature_names_in_ = X.columns.tolist()
        return self

    def set_output(self, *, transform=None):
        """Accept and ignore the output-container setting; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with each column converted to ``float``.

        Parameters
        ----------
        X : pandas.DataFrame
            Columns to convert. The input frame is not modified.

        Returns
        -------
        pandas.DataFrame
            Same columns and index, each column of float dtype.
        """
        check_is_fitted(self)
        X = X.copy()
        for col in X.columns:
            # A bare astype(float) raises on unparseable entries; coercing
            # first turns them into NaN, matching ConvertToInt64's handling.
            X[col] = pd.to_numeric(X[col], errors='coerce').astype(float)
        return X

class OutlierClipper(BaseEstimator, TransformerMixin):
    """Clip columns to per-column quantile bounds learned in ``fit``.

    Parameters
    ----------
    columns : str, list-like or None, default=None
        Columns to clip. ``None`` clips every column seen in ``fit``.
        Columns seen in ``fit`` but not selected here pass through unchanged.
    lower_quantile : float, default=0.01
        Quantile used as the lower clipping bound.
    upper_quantile : float, default=0.99
        Quantile used as the upper clipping bound.

    Attributes
    ----------
    feature_names_in_ : list
        Column labels of the frame passed to ``fit``.
    lower_bounds_, upper_bounds_ : dict
        Column label -> learned bound, for the clipped columns only.
    """

    def __init__(self, columns=None, lower_quantile=0.01, upper_quantile=0.99):
        self.columns = columns
        self.lower_quantile = lower_quantile
        self.upper_quantile = upper_quantile

    def _selected_columns(self, X):
        """Resolve ``self.columns`` against the columns of ``X``."""
        if self.columns is None:
            return X.columns.tolist()
        # A bare string is one column name, not an iterable of characters.
        selected = [self.columns] if isinstance(self.columns, str) else list(self.columns)
        unknown = [col for col in selected if col not in X.columns]
        if unknown:
            raise ValueError(f"columns not found in X: {unknown}")
        return selected

    def fit(self, X, y=None):
        """Learn the clipping bounds for the selected columns.

        Raises
        ------
        ValueError
            If ``lower_quantile`` exceeds ``upper_quantile`` or a requested
            column is absent from ``X``.
        """
        if self.lower_quantile > self.upper_quantile:
            raise ValueError(
                "lower_quantile must not exceed upper_quantile "
                f"(got {self.lower_quantile} > {self.upper_quantile})"
            )

        X = pd.DataFrame(X)
        self.feature_names_in_ = X.columns.tolist()
        clip_cols = self._selected_columns(X)
        self.lower_bounds_ = {
            col: X[col].quantile(self.lower_quantile) for col in clip_cols
        }
        self.upper_bounds_ = {
            col: X[col].quantile(self.upper_quantile) for col in clip_cols
        }
        return self

    def set_output(self, *, transform=None):
        """Accept and ignore the output-container setting; returns ``self``."""
        return self

    def transform(self, X):
        """Return a DataFrame copy of ``X`` with the selected columns clipped.

        Raises
        ------
        KeyError
            If ``X`` contains a column that was not present during ``fit``.
        """
        check_is_fitted(self)
        X = pd.DataFrame(X).copy()

        for col in X.columns:
            if col in self.lower_bounds_:
                X[col] = X[col].clip(self.lower_bounds_[col], self.upper_bounds_[col])
            elif col not in self.feature_names_in_:
                # Columns never seen in fit still raise KeyError, as before.
                raise KeyError(col)

        return X


# Binary features

1. Make all values that are not 0 or 1 classified as < NaN > for imputation.
2. 0 represents the absence of a characteristic, making it a safe default for imputation.
3. Convert back to int for consistency.

In [4]:
# Flag columns: anything that is not 0/1 becomes NA, NA is filled with 0,
# and the result is cast back to a plain integer dtype.
binary_pipeline = Pipeline([
    ('Int', ConvertToInt64()),
    ('impute', SimpleImputer(fill_value=0, strategy='constant')),
    ('int', ConvertToint64()),
])

# Sex feature
1. Convert to Int64 for unified nans
2. mode makes sense for a sex feature
3. back to int64 for consistency

In [5]:
# Sex column: same cleaning as the flag columns, but gaps are filled with
# the most frequent value instead of a constant.
sex_pipeline = Pipeline([
    ('Int', ConvertToInt64()),
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('int', ConvertToint64()),
])

# Numeric features
## Basic
1. convert to float for unified NaNs for imputing
2. median is a safe imputation strategy
3. clip at the (0.01) and (0.99) quantiles
4. scale

## log features
1. convert to float for unified NaNs for imputing
2. median is a safe imputation strategy
3. log transform to handle skewed features
4. clip at the (0.01) and (0.99) quantiles
5. scale

In [6]:
def _make_numeric_pipeline(log_transform=False):
    """Build the numeric preprocessing pipeline.

    Steps: float coercion -> median imputation -> (optional log1p) ->
    quantile clipping -> standard scaling. Each call creates fresh
    transformer instances.
    """
    steps = [
        ('float', CoerceToFloat()),
        ('impute', SimpleImputer(strategy='median')),
    ]
    if log_transform:
        # The log step sits between imputation and clipping.
        log_step = FunctionTransformer(func=np.log1p, feature_names_out='one-to-one')
        steps.append(('log_transform', log_step))
    steps.extend([
        ('clip', OutlierClipper()),
        ('scale', StandardScaler()),
    ])
    return Pipeline(steps)


basic_numeric_pipeline = _make_numeric_pipeline()
log_numeric_pipeline = _make_numeric_pipeline(log_transform=True)

# Pipeline

In [7]:
# Column groups, one per preprocessing sub-pipeline.
binary_cols = [
    'anaemia', 'diabetes', 'high_blood_pressure', 'smoking'
]

basic_numeric_features = [
    'age',
    'ejection_fraction',
    'serum_sodium',
    'time'
]

log_numeric_features = [
    'creatinine_phosphokinase',
    'platelets',
    'serum_creatinine'
]

# Route each column group through its own pipeline. The transformer names
# ('bin', 'sex', 'basic_num', 'log_num') become prefixes of the output columns.
preprocessor = ColumnTransformer([
    ('bin', binary_pipeline, binary_cols),
    ('sex', sex_pipeline, ['sex']),
    ('basic_num', basic_numeric_pipeline, basic_numeric_features),
    ('log_num', log_numeric_pipeline, log_numeric_features),
])


# Preprocessing followed by a class-weighted, seeded logistic regression.
full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(class_weight='balanced', random_state=42))
])

# Request DataFrame output from the pipeline steps. The trailing semicolon
# suppresses the cell's repr; assigning to `_` would instead overwrite
# IPython's last-output variable and stop IPython from updating it.
full_pipeline.set_output(transform="pandas");

# Evaluate model
1. I used a locked cross-validation for a deterministic evaluation.
2. I applied the model to the hold-out test set to assess generalization.



In [8]:
# Seeded, shuffled stratified folds: every run scores the same five splits.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Each key is both the report label and the scikit-learn scorer name.
scoring = {
    name: name
    for name in ('accuracy', 'precision', 'recall', 'f1', 'roc_auc')
}

scores = cross_validate(
    full_pipeline,
    X_train,
    y_train,
    scoring=scoring,
    cv=cv,
    return_train_score=False,
)

cv_results = pd.DataFrame(scores)

print("Cross-Validation:")
for metric in scoring:
    fold_scores = cv_results[f'test_{metric}']
    mean_score, std_score = fold_scores.mean(), fold_scores.std()
    print(f"{metric:>10}: {mean_score:.4f} ± {std_score:.4f}")

# Refit on the full training split, then score the held-out split once.
full_pipeline.fit(X_train, y_train)

y_pred = full_pipeline.predict(X_val)
y_proba = full_pipeline.predict_proba(X_val)[:, 1]  # probability of class 1

holdout_report = (
    ("  Accuracy : ", accuracy_score(y_val, y_pred)),
    ("  Precision: ", precision_score(y_val, y_pred)),
    ("  Recall   : ", recall_score(y_val, y_pred)),
    ("  F1 Score : ", f1_score(y_val, y_pred)),
    ("  ROC AUC  : ", roc_auc_score(y_val, y_proba)),
)

print("Hold-out Set:")
for label, value in holdout_report:
    print(f"{label}{value:.4f}")

conf_mat = confusion_matrix(y_val, y_pred)
print("\nConfusion Matrix:")
print(conf_mat)



Cross-Validation:
  accuracy: 0.8035 ± 0.0472
 precision: 0.6680 ± 0.0753
    recall: 0.7917 ± 0.0332
        f1: 0.7236 ± 0.0563
   roc_auc: 0.8845 ± 0.0305
Hold-out Set:
  Accuracy : 0.7833
  Precision: 0.6875
  Recall   : 0.5789
  F1 Score : 0.6286
  ROC AUC  : 0.8639

Confusion Matrix:
[[36  5]
 [ 8 11]]


# Feature importance
1. Time seems to be the most crucial feature, followed by serum creatinine and ejection fraction.

In [9]:
# Pull the fitted preprocessing and classifier steps out of the pipeline.
fitted_steps = full_pipeline.named_steps
preprocessor = fitted_steps['preprocessor']
model = fitted_steps['classifier']

# SHAP is computed in the classifier's input space, so transform first.
X_val_transformed = preprocessor.transform(X_val)
feature_names = X_val_transformed.columns

# The transformed validation frame is passed both as the explainer's second
# argument and as the data being explained.
explainer = shap.Explainer(model, X_val_transformed)
shap_values = explainer(X_val_transformed)

# Average the absolute SHAP values along axis 0 to get one score per feature.
shap_importance = np.abs(shap_values.values).mean(axis=0)

shap_df = (
    pd.DataFrame({'feature': feature_names, 'mean_abs_shap': shap_importance})
    .sort_values(by='mean_abs_shap', ascending=False)
    .reset_index(drop=True)
)
display(shap_df)

Unnamed: 0,feature,mean_abs_shap
0,basic_num__time,1.372184
1,log_num__serum_creatinine,0.636784
2,basic_num__ejection_fraction,0.606208
3,basic_num__age,0.363512
4,log_num__creatinine_phosphokinase,0.330853
5,bin__diabetes,0.212678
6,bin__anaemia,0.179225
7,log_num__platelets,0.155719
8,sex__sex,0.11766
9,bin__smoking,0.089633


# Summary
In this field, false negatives are more dangerous than false positives, because a false negative could mean missing a high-risk patient possibly resulting in death. Therefore, recall is the key metric.

What I learned from the baseline model:
1. The model is generalizing well: it performed well on the hold-out test set. Because the dataset is relatively small, I tested the recall on 5 additional random seeds to ensure it wasn't a lucky split, and the results held.
2. The false negative rate is currently around 42.1% on the hold-out set (8 false negatives out of 19 actual positives), which is relatively high and should be further reduced in the final model to avoid missing high-risk patients.
3. The model significantly prioritized numeric features over binary ones. Among all features, time was the most significant factor, based on SHAP values.