# Baseline model

The data had multiple linear features making logistic regression a good for baseline interpretability

In [85]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn.pipeline")
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin, OneToOneFeatureMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import (
    make_scorer,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
from sklearn.linear_model import LogisticRegression
import shap
from sklearn.metrics import confusion_matrix

In [86]:
url = "https://raw.githubusercontent.com/CarsonShively/Heart-Failure/refs/heads/main/data/heart_failure.csv"
df = pd.read_csv(url)
df.drop_duplicates(inplace=True)

X = df.drop('DEATH_EVENT', axis=1)
y = df['DEATH_EVENT']

X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)


# Binary features

1. Make all values that are not 0 or 1 classified as < NaN > for imputation.
2. 0 represents the absence of a characteristic, making it a safe default for imputation.
4. Mode is a safe imputation for a sex feature in a baseline model.
3. Convert back to int to remain consistent.

In [87]:
binary_cols = [
    'anaemia', 'diabetes', 'high_blood_pressure', 'sex', 'smoking'
]

zero_impute = [
    'anaemia', 'diabetes', 'high_blood_pressure', 'smoking'
]

class BinaryInt64Cleaner(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    def __init__(self, binary_cols):
        self.binary_cols = binary_cols

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        for col in self.binary_cols:
            X[col] = pd.to_numeric(X[col], errors='coerce').astype('Int64')
            X[col] = X[col].where(X[col].isin([0, 1]), pd.NA)
        return X


class BinaryImputer(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    def __init__(self, binary_cols, sex_col='sex'):
        self.binary_cols = binary_cols
        self.sex_col = sex_col

    def fit(self, X, y=None):
        self.sex_mode_ = X[self.sex_col].mode(dropna=True)[0]
        return self

    def transform(self, X):
        X = X.copy()
        for col in self.binary_cols:
            X[col] = X[col].fillna(0)
        X[self.sex_col] = X[self.sex_col].fillna(self.sex_mode_)
        return X

class ConvertToInt(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        for col in self.columns:
            X[col] = X[col].astype(int)
        return X


binary_pipeline = Pipeline(steps=[
    ('cleaner', BinaryInt64Cleaner(binary_cols=binary_cols)),
    ('imputer', BinaryImputer(binary_cols=zero_impute, sex_col='sex')),
    ('to_int', ConvertToInt(columns=binary_cols))
])
binary_pipeline.set_output(transform="pandas")

# Numeric features
1. Coerce to float so that all NaNs are properly classified as np.nan for imputation.
2. Median is a safe imputation strategy for these numeric features.
3. Apply skew-reducing transformations identified during EDA.
4. Clip outliers at the (0.01) and (0.99) quantiles.
5. Add squared features to capture non-linear relationships.
6. Scale the features for model compatibility and convergence.

In [88]:
numeric_features = [
    'age',
    'creatinine_phosphokinase',
    'ejection_fraction',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
    'time',
]

log_transform_features = [
    'creatinine_phosphokinase',
    'platelets',
    'serum_creatinine'
]

squared_features = [
    'ejection_fraction',
    'serum_creatinine',
    'time'
]

scale_features = [
    'age',
    'creatinine_phosphokinase',
    'ejection_fraction',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
    'time',
    'ejection_fraction_squared',
    'serum_creatinine_squared',
    'time_squared'
]

class CoerceToFloat(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        for col in self.columns:
            X[col] = X[col].astype(float)
        return X

class NumericImputer(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        X = X.copy()
        self.medians_ = {
            col: X[col].median(skipna=True)
            for col in self.columns
        }
        return self

    def transform(self, X):
        X = X.copy()
        for col in self.columns:
            X[col] = X[col].fillna(self.medians_[col])
        return X


class LogTransformer(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        for col in self.columns:
            X[col] = np.log1p(X[col])
        return X

class OutlierClipper(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    def __init__(self, columns, lower_quantile=0.01, upper_quantile=0.99):
        self.columns = columns
        self.lower_quantile = lower_quantile
        self.upper_quantile = upper_quantile

    def fit(self, X, y=None):
        X = X.copy()
        self.lower_bounds_ = {
            col: X[col].quantile(self.lower_quantile) for col in self.columns
        }
        self.upper_bounds_ = {
            col: X[col].quantile(self.upper_quantile) for col in self.columns
        }
        return self

    def transform(self, X):
        X = X.copy()
        for col in self.columns:
            X[col] = X[col].clip(self.lower_bounds_[col], self.upper_bounds_[col])
        return X

class SquaredFeatureAdder(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        for col in self.columns:
            X[f'{col}_squared'] = X[col] ** 2
        return pd.DataFrame(X, columns=X.columns, index=X.index)

    def set_output(self, transform=None):
        return self

class Scaler(BaseEstimator, TransformerMixin, OneToOneFeatureMixin):
    def __init__(self, columns):
        self.columns = columns
        self.scaler = StandardScaler()

    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns])
        return self

    def transform(self, X):
        X = X.copy()
        X[self.columns] = self.scaler.transform(X[self.columns])
        return X

numeric_pipeline = Pipeline([
    ('float', CoerceToFloat(columns=numeric_features)),
    ('imputer', NumericImputer(columns=numeric_features)),
    ('log_transform', LogTransformer(columns=log_transform_features)),
    ('clip', OutlierClipper(columns=numeric_features)),
    ('squared', SquaredFeatureAdder(columns=squared_features)),
    ('scale', Scaler(columns=scale_features))
])

numeric_pipeline.set_output(transform="pandas")

In [89]:
preprocessor = ColumnTransformer([
    ('bin', binary_pipeline, binary_cols),
    ('num', numeric_pipeline, numeric_features)
])

preprocessor.set_output(transform='pandas')

full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(class_weight='balanced', random_state=42))
])

full_pipeline.set_output(transform="pandas")

# Evalute model
1. I used a locked cross-validation for a more robust deterministic evaluation.
2. I applied the model to the hold-out test set to assess generalization.
3. I generated feature importance with SHAP to better understand both the model and the data.



In [91]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1',
    'roc_auc': 'roc_auc'
}

scores = cross_validate(
    estimator=full_pipeline,
    X=X_train,
    y=y_train,
    cv=cv,
    scoring=scoring,
    return_train_score=False
)

cv_results = pd.DataFrame(scores)

print("Cross-Validation Performance:")
for metric in scoring:
    mean_score = cv_results[f'test_{metric}'].mean()
    std_score = cv_results[f'test_{metric}'].std()
    print(f"{metric:>10}: {mean_score:.4f} ± {std_score:.4f}")

full_pipeline.fit(X_train, y_train)

y_pred = full_pipeline.predict(X_val)
y_proba = full_pipeline.predict_proba(X_val)[:, 1]

print("Test Set Evaluation:")
print(f"  Accuracy : {accuracy_score(y_val, y_pred):.4f}")
print(f"  Precision: {precision_score(y_val, y_pred):.4f}")
print(f"  Recall   : {recall_score(y_val, y_pred):.4f}")
print(f"  F1 Score : {f1_score(y_val, y_pred):.4f}")
print(f"  ROC AUC  : {roc_auc_score(y_val, y_proba):.4f}")
conf_mat = confusion_matrix(y_val, y_pred)
print("\nConfusion Matrix:")
print(conf_mat)

for seed in [0, 10, 20, 30, 40]:
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=seed
    )
    full_pipeline.fit(X_train, y_train)
    y_pred = full_pipeline.predict(X_val)
    print(f"Seed {seed} - Recall: {recall_score(y_val, y_pred):.4f}")

preprocessor = full_pipeline.named_steps['preprocessor']
model = full_pipeline.named_steps['classifier']

X_val_transformed = preprocessor.transform(X_val)

feature_names = X_val_transformed.columns

explainer = shap.Explainer(model, X_val_transformed)
shap_values = explainer(X_val_transformed)

shap_importance = np.abs(shap_values.values).mean(axis=0)

shap_df = pd.DataFrame({
    'feature': feature_names,
    'mean_abs_shap': shap_importance
})

shap_df.sort_values(by='mean_abs_shap', ascending=False, inplace=True)

shap_df.reset_index(drop=True, inplace=True)
display(shap_df)

Cross-Validation Performance:
  accuracy: 0.8074 ± 0.0406
 precision: 0.6755 ± 0.0787
    recall: 0.8050 ± 0.0447
        f1: 0.7314 ± 0.0394
   roc_auc: 0.8926 ± 0.0413
Test Set Evaluation:
  Accuracy : 0.8500
  Precision: 0.7778
  Recall   : 0.7368
  F1 Score : 0.7568
  ROC AUC  : 0.8986

Confusion Matrix:
[[37  4]
 [ 5 14]]
Seed 0 - Recall: 0.7895
Seed 10 - Recall: 0.7895
Seed 20 - Recall: 0.7895
Seed 30 - Recall: 0.7895
Seed 40 - Recall: 0.7368


Unnamed: 0,feature,mean_abs_shap
0,num__time,1.7248
1,num__ejection_fraction,1.297405
2,num__serum_creatinine,0.835037
3,num__ejection_fraction_squared,0.757297
4,num__serum_creatinine_squared,0.420925
5,num__time_squared,0.406412
6,num__creatinine_phosphokinase,0.329613
7,num__age,0.310409
8,num__serum_sodium,0.237408
9,num__platelets,0.1714


# Summary
In this field, false negatives are more dangerous than false positives, because a false negative could mean missing a high-risk patient possibly resulting in death. Therefore, recall is a key metric.

What I learned from the metrics:
1. The model is generalizing well. It performed well on the hold-out test set, because the dataset is relatively small, I tested the recall on 5 additional random seeds to ensure it wasn't a lucky split.
2. The false negative rate is currently about 26%, which is reasonable, but it should be further optimized in the final model to reduce the number of missed high-risk patients.
3. The model significantly prioritized numeric features over binary ones. Among all features, time was the most significant factor, based on SHAP values.