<a href="https://colab.research.google.com/github/AnanthSundarrajan/heart-failure-prediction/blob/main/HPE_Streamlit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Heart Failure Prediction - User Experience Source code

## Step 1 - Load Data set

In [1]:
# Fetch the "heart-failure-prediction" dataset from Kaggle
# (https://www.kaggle.com/datasets/fedesoriano/heart-failure-prediction)
# and read it into a pandas DataFrame.
import os

import kagglehub
import pandas as pd

# Download the latest version; the returned value is used below as the
# directory that contains the dataset files.
path = kagglehub.dataset_download("fedesoriano/heart-failure-prediction")

# Read heart.csv from the downloaded dataset into a DataFrame.
csv_file = os.path.join(path, 'heart.csv')
df = pd.read_csv(csv_file)

Using Colab cache for faster access to the 'heart-failure-prediction' dataset.


## Step 2 - Data Pre-Processing

In [2]:
# Pre-processing: one-hot encode the categorical columns
# ('Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope').

# Column groups reused by later cells.
categorical_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
numerical_cols = ['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak']

# The target variable must never be one-hot encoded, so it is filtered out
# defensively (it is not in the list above, so nothing is removed today).
categorical_cols_for_encoding = list(
    filter(lambda name: name != 'HeartDisease', categorical_cols)
)

# drop_first=True omits the first level of every categorical column; that
# level is represented by all of its sibling dummy columns being False.
df_encoded = pd.get_dummies(
    df,
    columns=categorical_cols_for_encoding,
    drop_first=True,
)

print("DataFrame after one-hot encoding:")
display(df_encoded.head())
print(f"New shape of DataFrame: {df_encoded.shape}")

DataFrame after one-hot encoding:


Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0,True,True,False,False,True,False,False,False,True
1,49,160,180,0,156,1.0,1,False,False,True,False,True,False,False,True,False
2,37,130,283,0,98,0.0,0,True,True,False,False,False,True,False,False,True
3,48,138,214,0,108,1.5,1,False,False,False,False,True,False,True,True,False
4,54,150,195,0,122,0.0,0,True,False,True,False,True,False,False,False,True


New shape of DataFrame: (918, 16)


In [3]:
# Pre-processing: standardize the numerical columns with sklearn's StandardScaler.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Fit on the untouched raw frame `df`, not on `df_encoded`.
# This cell overwrites the numerical columns of `df_encoded`; if the scaler were
# fitted on `df_encoded`, executing the cell a second time would fit it on
# already-standardized values (mean ~0, std ~1). That scaler is saved later for
# the Streamlit app, which would then fail to map raw patient values onto the
# scale the model was trained on. `df_encoded` was derived from `df` with the
# same rows in the same order, so on a first run the result is unchanged.
df_encoded[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split done later in this notebook, so test rows influence the
# scaling statistics. Consider fitting on the training rows only (for example
# inside a sklearn Pipeline); that requires reordering the cells.

print("Numerical columns scaled successfully.")
print("First 5 rows of DataFrame after scaling numerical features:")
display(df_encoded.head())

Numerical columns scaled successfully.
First 5 rows of DataFrame after scaling numerical features:


Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
0,-1.43314,0.410909,0.82507,-0.551341,1.382928,-0.832432,0,True,True,False,False,True,False,False,False,True
1,-0.478484,1.491752,-0.171961,-0.551341,0.754157,0.105664,1,False,False,True,False,True,False,False,True,False
2,-1.751359,-0.129513,0.770188,-0.551341,-1.525138,-0.832432,0,True,True,False,False,False,True,False,False,True
3,-0.584556,0.302825,0.13904,-0.551341,-1.132156,0.574711,1,False,False,False,False,True,False,True,True,False
4,0.051881,0.951331,-0.034755,-0.551341,-0.581981,-0.832432,0,True,False,True,False,True,False,False,False,True


In [4]:
# Pre-processing: separate the encoded frame into
#   y - the target variable (the "HeartDisease" column)
#   X - the features (every remaining column, in their existing order)

target_col = 'HeartDisease'

feature_names = [name for name in df_encoded.columns if name != target_col]
X = df_encoded[feature_names]
y = df_encoded[target_col]

print("Features (X) and target (y) separated successfully.")
print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape}")

Features (X) and target (y) separated successfully.
Shape of X: (918, 15)
Shape of y: (918,)


In [5]:
# Pre-processing: hold out 20% of the rows as a test set for later evaluation.
from sklearn.model_selection import train_test_split

# random_state is fixed so the same rows land in each partition on every run.
splits = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = splits

print("Data split into training and testing sets successfully.")
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of y_test: {y_test.shape}")

Data split into training and testing sets successfully.
Shape of X_train: (734, 15)
Shape of X_test: (184, 15)
Shape of y_train: (734,)
Shape of y_test: (184,)


## Step 3 - Train and Tune the Model using the Gradient Boosting method

In [6]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


# Hyper-parameter grid for Gradient Boosting: 3 x 3 x 3 = 27 candidates.
param_grid_gb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
}

# Base estimator; the fixed seed keeps the boosting runs reproducible.
gb_classifier = GradientBoostingClassifier(random_state=42)

# Exhaustive search with 5-fold cross-validation, ranking candidates by F1.
grid_search_gb = GridSearchCV(
    estimator=gb_classifier,
    param_grid=param_grid_gb,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training partition only.
grid_search_gb.fit(X_train, y_train)

# Report the winning configuration and its mean cross-validated F1.
print("Best parameters found: ", grid_search_gb.best_params_)
print("Best F1-score found: ", grid_search_gb.best_score_)

# Keep the best estimator found by the search for evaluation and export.
best_gb_model = grid_search_gb.best_estimator_

# Score the held-out test partition.
y_pred = best_gb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best parameters found:  {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300}
Best F1-score found:  0.8866408244010522
Accuracy: 0.8804
Precision: 0.8972
Recall: 0.8972
F1-Score: 0.8972


## Step 4 - Create a Streamlit Interface

# Task
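Create and deploy a Streamlit application that predicts heart failure using the trained Gradient Boosting model. The application should load the model and scaler, accept user input for heart-related features, preprocess the input, make predictions, and display the results. Configure ngrok to expose the Streamlit application, reading the authentication token from a secret (for example the `NGROK_AUTHTOKEN` environment variable or Colab's secrets manager) instead of embedding it in the notebook. Note: an ngrok token was previously written here in plain text; it should be treated as compromised and revoked/rotated.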

## Save Model and Scaler

### Subtask:
Save the trained `best_gb_model` and the `scaler` object to disk using joblib, so they can be loaded into the Streamlit application.


**Reasoning**:
Import the `joblib` library and save the `best_gb_model` and `scaler` objects to disk.



In [7]:
import joblib

# Persist the tuned classifier so the Streamlit app can load it from disk.
model_file = 'gradient_boosting_model.joblib'
joblib.dump(best_gb_model, model_file)
print("Gradient Boosting model saved as 'gradient_boosting_model.joblib'")

# Persist the fitted scaler as well; the app applies it to the user's
# numerical inputs before calling the model.
scaler_file = 'scaler.joblib'
joblib.dump(scaler, scaler_file)
print("Scaler object saved as 'scaler.joblib'")

Gradient Boosting model saved as 'gradient_boosting_model.joblib'
Scaler object saved as 'scaler.joblib'


## Create Streamlit App (app.py)

### Subtask:
Generate the `app.py` file containing the Streamlit application code. This code will load the saved model and scaler, define input fields for user data, preprocess the input, make predictions, and display the results. It will also define the feature names in the correct order for prediction.


**Reasoning**:
The next step is to generate the Python code for the Streamlit application as a string, incorporating all the necessary imports, model loading, feature definitions, UI elements, and prediction logic as specified in the instructions. This code will then be saved to `app.py`.



In [8]:
# Source code of the Streamlit app, kept as a string and written to app.py at
# the end of this cell. The literal is delimited by triple double quotes, so
# the app code inside must only use '#' comments (never that same delimiter).
app_code = """
import streamlit as st
import pandas as pd
import joblib
import numpy as np

# 1. Page configuration - issued before any other Streamlit command
st.set_page_config(page_title="Heart Disease Prediction App created by Ananth Sundarrajan", layout='centered')


@st.cache_resource
def load_artifacts():
    # Load the trained model and the fitted scaler.
    # Streamlit re-executes this script on every widget interaction; caching
    # keeps both objects in memory instead of re-reading the files each time.
    # Restart the app (or clear its cache) after re-exporting the files.
    model = joblib.load('gradient_boosting_model.joblib')
    fitted_scaler = joblib.load('scaler.joblib')
    return model, fitted_scaler


best_gb_model, scaler = load_artifacts()

# 2. Define categorical and numerical column names used during training
numerical_cols = ['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak']

# Levels of every categorical column. The FIRST level of each list is the
# reference level that training dropped (get_dummies with drop_first=True),
# so it has no dummy column of its own.
categorical_cols = {
    'Sex': ['F', 'M'],
    'ChestPainType': ['ASY', 'ATA', 'NAP', 'TA'],
    'RestingECG': ['LVH', 'Normal', 'ST'],
    'ExerciseAngina': ['N', 'Y'],
    'ST_Slope': ['Down', 'Flat', 'Up']
}

# Define the full list of features in the exact order the model expects
# This needs to match the columns of X_train used for model training
model_features = ['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak', 'Sex_M', 'ChestPainType_ATA', 'ChestPainType_NAP', 'ChestPainType_TA', 'RestingECG_Normal', 'RestingECG_ST', 'ExerciseAngina_Y', 'ST_Slope_Flat', 'ST_Slope_Up']

# 3. Streamlit App Layout
st.title("Heart Disease Prediction")
st.write("Enter the patient's details to predict the likelihood of heart disease.")

# Input fields for user data
st.header('Patient Information')

# Numerical inputs
age = st.slider('Age (in years)', 18, 100, 50)
resting_bp = st.slider('Resting Systolic Blood Pressure (mmHg)', 80, 200, 120)
cholesterol = st.slider('Cholesterol (mg/dl)', 100, 600, 200)
fasting_bs = st.selectbox('Fasting Blood Sugar > 120 mg/dl', options=[0, 1], format_func=lambda x: 'Yes' if x == 1 else 'No')
max_hr = st.slider('Maximum Heart Rate Achieved', 60, 220, 150)
oldpeak = st.slider('Oldpeak (ST depression induced by exercise relative to rest found on your ECG)', 0.0, 6.2, 1.0)

# Categorical inputs
sex = st.selectbox('Sex', options=categorical_cols['Sex'])
chest_pain_type = st.selectbox('Chest Pain Type (TA: Typical Angina, ATA: Atypical Angina, NAP: Non-Anginal Pain, ASY: Asymptomatic)', options=categorical_cols['ChestPainType'])
resting_ecg = st.selectbox('Resting ECG Results (Normal: Normal, ST: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV), LVH: showing probable or definite left ventricular hypertrophy by Estes criteria)', options=categorical_cols['RestingECG'])
exercise_angina = st.selectbox('Exercise Induced Angina', options=categorical_cols['ExerciseAngina'])
st_slope = st.selectbox('ST_Slope (the slope of the peak exercise ST segment [Up: upsloping, Flat: flat, Down: downsloping])', options=categorical_cols['ST_Slope'])

# 4. Prediction Logic
if st.button('Predict Heart Disease'):
    # Collect the raw inputs as a single-row DataFrame
    input_data = {
        'Age': age,
        'Sex': sex,
        'ChestPainType': chest_pain_type,
        'RestingBP': resting_bp,
        'Cholesterol': cholesterol,
        'FastingBS': fasting_bs,
        'RestingECG': resting_ecg,
        'MaxHR': max_hr,
        'ExerciseAngina': exercise_angina,
        'Oldpeak': oldpeak,
        'ST_Slope': st_slope
    }
    input_df = pd.DataFrame([input_data])

    # One-hot encode: one 0/1 column per NON-reference level. Skipping the
    # first level of each list mirrors drop_first=True from training; a
    # reference level is encoded as all of its sibling columns being 0.
    for col, categories in categorical_cols.items():
        for cat in categories[1:]:
            input_df[f'{col}_{cat}'] = (input_df[col] == cat).astype(int)

    # The raw categorical columns are no longer needed once encoded
    input_df = input_df.drop(columns=list(categorical_cols.keys()))

    # Scale numerical features with the scaler fitted during training
    input_df[numerical_cols] = scaler.transform(input_df[numerical_cols])

    # Put the columns in the model's expected order; any expected feature
    # that is absent is created and filled with 0
    final_input_df = input_df.reindex(columns=model_features, fill_value=0)

    # Make prediction (class label and probability of the positive class)
    prediction = best_gb_model.predict(final_input_df)
    prediction_proba = best_gb_model.predict_proba(final_input_df)[:, 1]

    st.subheader('Prediction Result:')
    if prediction[0] == 1:
        st.error(f'Based on the provided information, Heart Disease is Predicted. (Probability: {prediction_proba[0]:.2f})')
    else:
        st.success(f'Based on the provided information, Heart Disease is NOT Predicted. (Probability: {prediction_proba[0]:.2f})')
"""

# Write the app_code string to a file named app.py (explicit encoding so the
# result does not depend on the platform's default).
with open('app.py', 'w', encoding='utf-8') as f:
    f.write(app_code)

print("Streamlit app code (app.py) generated successfully.")

Streamlit app code (app.py) generated successfully.
