This notebook provides details about my preprocessing steps for the heart_failure_clinical_records_dataset used in [DonDie.ai](https://github.com/AvitBrian/DonDie.ai).

In [2]:
!pip install tensorflow==2.17.0

import pandas as pd
import numpy as np
import joblib
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from flask import Flask, render_template, request


def load_data(url):
    """Load the dataset from a given URL."""
    return pd.read_csv(url)

def preprocess_features(df):
    """Preprocess features by dropping specified columns and handling numerical columns."""
    X = df.drop('DEATH_EVENT', axis=1)
    y = df['DEATH_EVENT']
    X_reduced = X.drop(['sex', 'serum_sodium', 'smoking', 'anaemia'], axis=1)
    numerical_cols = X_reduced.select_dtypes(include=['number']).columns.tolist()
    return X_reduced, y, numerical_cols

def build_preprocessor(numerical_cols):
    """Build a preprocessing pipeline for numerical features."""
    return ColumnTransformer(
        transformers=[
            ('num', Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='mean')),
                ('scaler', StandardScaler())
            ]), numerical_cols)
        ]
    )

def split_data(X, y):
    """Split the dataset into training, validation, and test sets."""
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42, stratify=y_train)
    return X_train, X_val, X_test, y_train, y_val, y_test

def apply_smote(X_train_preprocessed, y_train):
    """Apply SMOTE to the training data."""
    smote = SMOTE(random_state=42)
    return smote.fit_resample(X_train_preprocessed, y_train)

def build_model(input_shape):
    """Build and compile the neural network model."""
    model = models.Sequential([
        layers.Dense(64, activation='relu', input_shape=(input_shape,), kernel_regularizer=tf.keras.regularizers.l2(0.01)),
        layers.Dropout(0.5),
        layers.Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
        layers.Dropout(0.5),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

def train_model(model, X_train_resampled, y_train_resampled, X_val_preprocessed, y_val):
    """Train the neural network model."""
    return model.fit(X_train_resampled, y_train_resampled,
                     validation_data=(X_val_preprocessed, y_val),
                     epochs=5000,
                     batch_size=32,
                     callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)])

def evaluate_model(model, X_test_preprocessed, y_test):
    """Evaluate the model on the test set."""
    test_loss, test_accuracy = model.evaluate(X_test_preprocessed, y_test)
    return test_accuracy

def save_objects(preprocessor, model):
    """Save the preprocessor and model to disk."""
    joblib.dump(preprocessor, 'preprocessor.pkl')
    model.save('model.h5')
    joblib.dump(model, 'model.pkl')


# Main execution
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00519/heart_failure_clinical_records_dataset.csv'
df = load_data(url)
X_reduced, y, numerical_cols = preprocess_features(df)
preprocessor = build_preprocessor(numerical_cols)

print(X_reduced.head())

X_train, X_val, X_test, y_train, y_val, y_test = split_data(X_reduced, y)

# Apply preprocessing
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_val_preprocessed = preprocessor.transform(X_val)
X_test_preprocessed = preprocessor.transform(X_test)

# Apply SMOTE
X_train_resampled, y_train_resampled = apply_smote(X_train_preprocessed, y_train)

# Define and train the model
model = build_model(X_train_resampled.shape[1])
history = train_model(model, X_train_resampled, y_train_resampled, X_val_preprocessed, y_val)

# Evaluate the model
test_accuracy = evaluate_model(model, X_test_preprocessed, y_test)
print(f'Test Accuracy: {test_accuracy:.4f}')

# Save the preprocessor and model
save_objects(preprocessor, model)


    age  creatinine_phosphokinase  diabetes  ejection_fraction  \
0  75.0                       582         0                 20   
1  55.0                      7861         0                 38   
2  65.0                       146         0                 20   
3  50.0                       111         0                 20   
4  65.0                       160         1                 20   

   high_blood_pressure  platelets  serum_creatinine  time  
0                    1  265000.00               1.9     4  
1                    0  263358.03               1.1     6  
2                    0  162000.00               1.3     7  
3                    0  210000.00               1.9     7  
4                    0  327000.00               2.7     8  
Epoch 1/5000


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 50ms/step - accuracy: 0.5389 - loss: 1.2632 - val_accuracy: 0.4667 - val_loss: 1.2371
Epoch 2/5000
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.5439 - loss: 1.2531 - val_accuracy: 0.5833 - val_loss: 1.1856
Epoch 3/5000
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.6127 - loss: 1.1747 - val_accuracy: 0.6667 - val_loss: 1.1369
Epoch 4/5000
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.5681 - loss: 1.1725 - val_accuracy: 0.7167 - val_loss: 1.0978
Epoch 5/5000
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.6654 - loss: 1.1028 - val_accuracy: 0.7500 - val_loss: 1.0627
Epoch 6/5000
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.6516 - loss: 1.0660 - val_accuracy: 0.7833 - val_loss: 1.0297
Epoch 7/5000
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━



Test Accuracy: 0.7333
