This notebook provides details about my preprocessing steps for the heart_failure_clinical_records_dataset used in [DonDie.ai](https://github.com/AvitBrian/DonDie.ai).

In [3]:
!pip install tensorflow==2.17.0

import pandas as pd
import numpy as np
import joblib
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from flask import Flask, render_template, request


def preprocess_features(X):
    """Preprocess features by dropping specified columns and handling numerical columns."""
    X_reduced = X.drop(['sex', 'serum_sodium', 'smoking', 'anaemia'], axis=1)
    numerical_cols = X_reduced.select_dtypes(include=['number']).columns.tolist()
    return X_reduced, numerical_cols

def build_preprocessor(numerical_cols):
    """Build a preprocessing pipeline for numerical features."""
    return ColumnTransformer(
        transformers=[
            ('num', Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='mean')),
                ('scaler', StandardScaler())
            ]), numerical_cols)
        ]
    )

def apply_preprocessing(preprocessor, X_train, X_val, X_test):
    """Apply preprocessing to the feature sets."""
    X_train_preprocessed = preprocessor.fit_transform(X_train)
    X_val_preprocessed = preprocessor.transform(X_val)
    X_test_preprocessed = preprocessor.transform(X_test)
    return X_train_preprocessed, X_val_preprocessed, X_test_preprocessed

def apply_smote(X_train_preprocessed, y_train):
    """Apply SMOTE to the training data."""
    smote = SMOTE(random_state=42)
    return smote.fit_resample(X_train_preprocessed, y_train)


# Main execution
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00519/heart_failure_clinical_records_dataset.csv'
df = pd.read_csv(url)

X = df.drop('DEATH_EVENT', axis=1)
y = df['DEATH_EVENT']

X_reduced, numerical_cols = preprocess_features(X)

preprocessor = build_preprocessor(numerical_cols)

print(X_reduced.head())

X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42, stratify=y_train)

# Apply preprocessing
X_train_preprocessed, X_val_preprocessed, X_test_preprocessed = apply_preprocessing(preprocessor, X_train, X_val, X_test)

# Apply SMOTE
X_train_resampled, y_train_resampled = apply_smote(X_train_preprocessed, y_train)

# Define and compile the neural network model
def build_model(input_shape):
    model = models.Sequential([
        layers.Dense(64, activation='relu', input_shape=(input_shape,), kernel_regularizer=tf.keras.regularizers.l2(0.01)),
        layers.Dropout(0.5),
        layers.Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
        layers.Dropout(0.5),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

# Build and train the neural network model
model = build_model(X_train_resampled.shape[1])
history = model.fit(X_train_resampled, y_train_resampled,
                    validation_data=(X_val_preprocessed, y_val),
                    epochs=5000,
                    batch_size=32,
                    callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)])

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test_preprocessed, y_test)
print(f'Test Accuracy: {test_accuracy:.4f}')

# Save the preprocessor and model
joblib.dump(preprocessor, 'preprocessor.pkl')
model.save('model.h5')
joblib.dump(model, 'model.pkl')


    age  creatinine_phosphokinase  diabetes  ejection_fraction  \
0  75.0                       582         0                 20   
1  55.0                      7861         0                 38   
2  65.0                       146         0                 20   
3  50.0                       111         0                 20   
4  65.0                       160         1                 20   

   high_blood_pressure  platelets  serum_creatinine  time  
0                    1  265000.00               1.9     4  
1                    0  263358.03               1.1     6  
2                    0  162000.00               1.3     7  
3                    0  210000.00               1.9     7  
4                    0  327000.00               2.7     8  
Epoch 1/5000


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 31ms/step - accuracy: 0.5264 - loss: 1.2765 - val_accuracy: 0.6000 - val_loss: 1.2413
Epoch 2/5000
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.4803 - loss: 1.2746 - val_accuracy: 0.6167 - val_loss: 1.1943
Epoch 3/5000
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.5056 - loss: 1.2295 - val_accuracy: 0.7167 - val_loss: 1.1533
Epoch 4/5000
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.5718 - loss: 1.1919 - val_accuracy: 0.7167 - val_loss: 1.1162
Epoch 5/5000
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.6602 - loss: 1.1454 - val_accuracy: 0.7833 - val_loss: 1.0800
Epoch 6/5000
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6774 - loss: 1.0647 - val_accuracy: 0.7833 - val_loss: 1.0446
Epoch 7/5000
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[



Test Accuracy: 0.7500


['model.pkl']