# Phase 1 & 2: Import Libraries
This section imports all required Python libraries for data loading, exploration, preprocessing, and evaluation.


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)


# Phase 1: Load and Inspect the Dataset
This cell loads the heart disease dataset and performs basic inspection, including:
- shape of the dataset
- column names
- first 5 rows
- datatypes
- missing values
- descriptive statistics
- target distribution


In [4]:
# Load the heart disease dataset from the notebook's working directory.
df = pd.read_csv("heart.csv")

print("Data loaded successfully.")
print("Shape (rows, columns):", df.shape)
print("\nColumns:")
print(df.columns.tolist())

print("\nFirst 5 rows:")
display(df.head())

print("\nDataFrame info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()

print("\nMissing values per column:")
print(df.isna().sum())

print("\nSummary statistics:")
display(df.describe())

# Class balance of the label column, as raw counts and as fractions.
print("\nTarget value counts (0 = no disease, 1 = disease):")
print(df["target"].value_counts())
print("\nTarget distribution (normalized):")
print(df["target"].value_counts(normalize=True))


Data loaded successfully.
Shape (rows, columns): (1025, 14)

Columns:
['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']

First 5 rows:


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0



DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB
None

Missing values per column:
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0
mean,54.434146,0.69561,0.942439,131.611707,246.0,0.149268,0.529756,149.114146,0.336585,1.071512,1.385366,0.754146,2.323902,0.513171
std,9.07229,0.460373,1.029641,17.516718,51.59251,0.356527,0.527878,23.005724,0.472772,1.175053,0.617755,1.030798,0.62066,0.50007
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,132.0,0.0,0.0,1.0,0.0,2.0,0.0
50%,56.0,1.0,1.0,130.0,240.0,0.0,1.0,152.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,275.0,0.0,1.0,166.0,1.0,1.8,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0



Target value counts (0 = no disease, 1 = disease):
target
1    526
0    499
Name: count, dtype: int64

Target distribution (normalized):
target
1    0.513171
0    0.486829
Name: proportion, dtype: float64


# Phase 2: Prepare the Data for Modeling
In this step we:
- separate features and target variable
- split data into training and testing sets
- scale the features using StandardScaler


In [5]:
# Feature matrix = every column except the label; label vector = "target".
X = df.drop(columns="target")
y = df["target"]

print("Features and target separated.")
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")


# 80/20 hold-out split, stratified on the label so both parts keep the same
# class balance; the fixed seed makes the split repeatable.
# NOTE(review): if heart.csv contains repeated patient rows, a random split
# puts copies of the same record in both parts and inflates test scores —
# TODO confirm with df.duplicated().sum().
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("\nTrain/Test split completed.")
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

# Learn the scaling statistics from the training part only, then apply the
# same transform to both parts.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\nScaling completed.")
print(f"X_train_scaled shape: {X_train_scaled.shape}")
print(f"X_test_scaled shape: {X_test_scaled.shape}")


Features and target separated.
X shape: (1025, 13)
y shape: (1025,)

Train/Test split completed.
X_train shape: (820, 13)
X_test shape: (205, 13)
y_train shape: (820,)
y_test shape: (205,)

Scaling completed.
X_train_scaled shape: (820, 13)
X_test_scaled shape: (205, 13)


# Phase 3 & 4: Model Evaluation Helper Function
This function evaluates a classification model using:
- accuracy
- precision
- recall
- F1 score
- ROC-AUC


In [19]:
def evaluate_model(name, model, X_test, y_test):
    """
    Score a fitted binary classifier on held-out data and print a short report.

    Parameters:
    - name: label printed in the report header
    - model: fitted estimator exposing predict(); predict_proba() or, failing
      that, decision_function() is used to compute ROC-AUC
    - X_test: test features in the same representation the model was fitted on
    - y_test: true labels for X_test

    Returns:
    - dict with keys "accuracy", "precision", "recall", "f1" and "roc_auc";
      "roc_auc" is None when the model exposes neither scoring method.
    """
    y_pred = model.predict(X_test)

    # ROC-AUC needs a continuous score rather than hard labels. Prefer the
    # positive-class probability (column 1); otherwise fall back to the signed
    # decision value, which roc_auc_score also accepts for binary problems.
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_score = model.decision_function(X_test)
    else:
        y_score = None

    acc = accuracy_score(y_test, y_pred)
    # zero_division=0 reports 0 instead of warning when a class is never predicted.
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    auc = roc_auc_score(y_test, y_score) if y_score is not None else None

    print(f"\nEvaluation for: {name}")
    print(f"Accuracy : {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall   : {rec:.4f}")
    print(f"F1 score : {f1:.4f}")
    print(f"ROC-AUC  : {auc:.4f}" if auc is not None else "ROC-AUC  : Not available")

    return {
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1": f1,
        "roc_auc": auc
    }


# Phase 3 & 4: Train and Evaluate Multiple Models
This cell trains four different machine learning models:
1. Naive Bayes  
2. Decision Tree  
3. Random Forest
4. Logistic Regression (to set the baseline)

After training, each model is evaluated using the helper function, and all
results are compared in a summary table.


In [10]:
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import pandas as pd

# Unfitted estimators. The names are kept at module level because later cells
# (prediction pipeline, UI) refer to them directly.
nb_model = GaussianNB()
dt_model = DecisionTreeClassifier(criterion="gini", max_depth=4, random_state=42)
rf_model = RandomForestClassifier(n_estimators=200, max_depth=6, random_state=42)
lr_model = LogisticRegression(max_iter=1000, random_state=42)

# (report name, estimator, training features, test features).
# Only logistic regression is given the standardized features; the other
# three models are trained and scored on the raw columns.
model_runs = [
    ("Naive Bayes", nb_model, X_train, X_test),
    ("Decision Tree", dt_model, X_train, X_test),
    ("Random Forest", rf_model, X_train, X_test),
    ("Logistic Regression", lr_model, X_train_scaled, X_test_scaled),
]

# Fit and score each model in turn, collecting the metric dicts by name.
results = {}
for model_label, estimator, train_features, test_features in model_runs:
    estimator.fit(train_features, y_train)
    results[model_label] = evaluate_model(model_label, estimator, test_features, y_test)

# One row per model, one column per metric.
results_df = pd.DataFrame(results).T

print("\nModel comparison:")
display(results_df)



Evaluation for: Naive Bayes
Accuracy : 0.8293
Precision: 0.8070
Recall   : 0.8762
F1 score : 0.8402
ROC-AUC  : 0.9043

Evaluation for: Decision Tree
Accuracy : 0.8390
Precision: 0.8214
Recall   : 0.8762
F1 score : 0.8479
ROC-AUC  : 0.8957

Evaluation for: Random Forest
Accuracy : 0.9659
Precision: 0.9623
Recall   : 0.9714
F1 score : 0.9668
ROC-AUC  : 0.9880

Evaluation for: Logistic Regression
Accuracy : 0.8098
Precision: 0.7619
Recall   : 0.9143
F1 score : 0.8312
ROC-AUC  : 0.9298

Model comparison:


Unnamed: 0,accuracy,precision,recall,f1,roc_auc
Naive Bayes,0.829268,0.807018,0.87619,0.840183,0.904286
Decision Tree,0.839024,0.821429,0.87619,0.847926,0.895667
Random Forest,0.965854,0.962264,0.971429,0.966825,0.988
Logistic Regression,0.809756,0.761905,0.914286,0.831169,0.92981


# Phase 5: Build the Prediction Pipeline
This section creates a function to:
- accept new patient input values
- convert them into the correct feature format
- scale features when needed
- use the selected trained model to predict the presence of heart disease


In [20]:
import numpy as np

def predict_heart_disease(model, input_data, use_scaling=False):
    """
    Predict heart disease from input patient data.

    Parameters:
    - model: trained model to use for prediction; must expose predict() and
      predict_proba()
    - input_data: dictionary with feature names and their values (key order
      does not matter when the fitted objects recorded their feature names)
    - use_scaling: set True if the model requires scaled inputs (e.g., Logistic
      Regression); the module-level `scaler` is then applied first

    Returns:
    - (prediction, probability): the predicted class for the single input row
      and the model's probability for class 1
    """
    input_df = pd.DataFrame([input_data])

    # The DataFrame inherits its column order from the dict. scikit-learn
    # checks column names *and order* against what it saw during fit, so line
    # the columns up with the object that consumes the frame first. Unknown
    # columns are kept (at the end) so scikit-learn's own validation still
    # reports them exactly as before.
    consumer = scaler if use_scaling else model
    fitted_names = getattr(consumer, "feature_names_in_", None)
    if fitted_names is not None:
        fitted_names = list(fitted_names)
        known = [col for col in fitted_names if col in input_df.columns]
        extra = [col for col in input_df.columns if col not in fitted_names]
        input_df = input_df[known + extra]

    features = scaler.transform(input_df) if use_scaling else input_df

    prediction = model.predict(features)[0]
    probability = model.predict_proba(features)[0][1]

    return prediction, probability

# Illustrative patient record, listed in the same order as the training columns.
example_input = dict(
    age=58,
    sex=1,
    cp=2,
    trestbps=130,
    chol=230,
    fbs=0,
    restecg=1,
    thalach=150,
    exang=0,
    oldpeak=1.2,
    slope=2,
    ca=0,
    thal=2,
)

# The random forest was trained on unscaled features, so no scaling here.
pred, prob = predict_heart_disease(rf_model, example_input, use_scaling=False)

print("Prediction (0=no disease, 1=disease):", pred)
print("Probability of disease:", prob)


Prediction (0=no disease, 1=disease): 1
Probability of disease: 0.7644721947827268


# Phase 6: Recommendation System
This section builds a rule-based module that:
- interprets model predictions
- generates simple lifestyle recommendations
- adds cautionary guidance for high-risk predictions

These are not medical diagnoses but general health suggestions.


In [17]:
def get_risk_level(probability):
    """
    Bucket a probability into one of five risk bands.

    Each band covers a 0.20-wide range with an exclusive upper edge; anything
    that is not below 0.80 falls into the top band.
    Returns a (level, label) tuple with level in 1..5.
    """
    # (exclusive upper bound, level, label), checked from lowest to highest.
    bands = (
        (0.20, 1, "Very low risk"),
        (0.40, 2, "Low risk"),
        (0.60, 3, "Moderate risk"),
        (0.80, 4, "High risk"),
    )
    for upper_bound, level, label in bands:
        if probability < upper_bound:
            return level, label
    return 5, "Very high risk"


def generate_recommendations(input_data, prediction, probability):
    """
    Assemble the list of advice strings for one patient.

    The list starts with a one-line risk summary, followed by advice for the
    risk band that `probability` falls into, then tips triggered by individual
    input values (chol, trestbps, fbs, exang), and finally two general
    reminders that are always included.

    `prediction` is accepted for interface compatibility but is not used.
    Returns a list of strings.
    """
    risk_level, risk_label = get_risk_level(probability)

    # Band-specific advice, keyed by the level returned from get_risk_level.
    advice_by_level = {
        1: [
            "Current risk appears very low. Maintain a heart-healthy lifestyle to keep it that way.",
            "Stay active with regular moderate exercise (for example, walking most days of the week).",
            "Continue with routine check-ups according to your doctor’s schedule.",
        ],
        2: [
            "Risk is slightly elevated compared with very low risk, but still relatively low.",
            "Improve diet quality: more fruits, vegetables, whole grains, and less processed food.",
            "Aim for at least 150 minutes of moderate exercise per week if your doctor agrees.",
            "Discuss heart health at your next regular doctor visit.",
        ],
        3: [
            "Moderate risk detected. It is a good idea to talk with a healthcare professional soon.",
            "Review your blood pressure, cholesterol, and blood sugar with a clinician.",
            "Consider lifestyle changes such as weight management, regular exercise, and limiting alcohol and smoking.",
            "Monitor any chest discomfort, shortness of breath, or unusual fatigue and report it.",
        ],
        4: [
            "High risk detected. You should schedule an appointment with a doctor or cardiologist in the near future.",
            "You may need more detailed tests (such as ECG, stress test, or imaging), as decided by your doctor.",
            "Avoid very intense physical exertion until a clinician has evaluated your condition.",
            "Take any new or worsening chest pain seriously and seek medical advice promptly.",
        ],
        5: [
            "Very high risk detected. You should seek medical evaluation as soon as possible.",
            "If you experience chest pain, shortness of breath, pain radiating to arm/jaw, or sudden sweating, treat it as an emergency.",
            "Do not rely on this tool as a diagnosis. Only a licensed professional can diagnose or treat heart disease.",
            "Until you have been evaluated, avoid strenuous activity and follow any existing medical advice you have received.",
        ],
    }

    recommendations = [
        f"Estimated risk level: {risk_label} ({probability:.2f} probability)."
    ]
    recommendations.extend(advice_by_level.get(risk_level, []))

    # Tips driven by individual inputs: (condition, message), evaluated in the
    # order cholesterol, blood pressure, fasting blood sugar, exercise angina.
    feature_tips = (
        (
            input_data["chol"] > 240,
            "Cholesterol appears high. Consider reducing saturated fats, fried foods, and added sugars, and increasing fiber intake.",
        ),
        (
            input_data["trestbps"] >= 140,
            "Resting blood pressure is elevated. Reducing salt, managing stress, and staying active can help, under medical supervision.",
        ),
        (
            input_data["fbs"] == 1,
            "Fasting blood sugar > 120 mg/dl. Consider screening for diabetes or prediabetes with your doctor.",
        ),
        (
            input_data["exang"] == 1,
            "Exercise induces chest discomfort. Avoid pushing yourself physically until you have been medically assessed.",
        ),
    )
    recommendations.extend(tip for triggered, tip in feature_tips if triggered)

    # General reminders appended regardless of risk band or inputs.
    recommendations.append("Do not start or stop any medication based on this tool. Use it only as a general guide and talk to a healthcare professional.")
    recommendations.append("Avoid smoking, maintain a healthy weight, sleep well, and manage stress to support heart health.")

    return recommendations


In [18]:
import gradio as gr

# Mapping helpers for user-friendly labels
# Display label -> numeric code sent to the models.
# Key order matters: the UI builds its dropdown choices from these keys.
SEX_MAP = {"Male": 1, "Female": 0}

YES_NO_MAP = {"Yes": 1, "No": 0}

# The remaining categorical inputs are coded 0, 1, 2, ... in listing order.
# NOTE(review): these label/code pairings are not verifiable from the notebook
# itself — confirm them against the dataset's documentation.
CP_MAP = {
    label: code
    for code, label in enumerate(
        ("Typical angina", "Atypical angina", "Non-anginal pain", "Asymptomatic")
    )
}

RESTECG_MAP = {
    label: code
    for code, label in enumerate(
        ("Normal", "ST-T wave abnormality", "Left ventricular hypertrophy")
    )
}

SLOPE_MAP = {
    label: code
    for code, label in enumerate(("Upsloping", "Flat", "Downsloping"))
}

# NOTE(review): the loaded data's `thal` column goes up to 3 (see the summary
# statistics and the first rows), but no label here maps to 3, so that value
# can never be entered through the UI — confirm the intended coding.
THAL_MAP = {
    label: code
    for code, label in enumerate(("Normal", "Fixed defect", "Reversible defect"))
}


def ui_predict(
    age,
    sex_label,
    cp_label,
    trestbps,
    chol,
    fbs_label,
    restecg_label,
    thalach,
    exang_label,
    oldpeak,
    slope_label,
    ca,
    thal_label,
    model_name
):
    """
    UI callback: turn the form values into a prediction and advice text.

    Label-valued inputs are translated to numeric codes through the *_MAP
    dictionaries, the model named by `model_name` is run through
    predict_heart_disease, and the result is passed to
    generate_recommendations.

    Returns a (prediction_text, recommendations_text) pair of strings. An
    unrecognised `model_name` yields ("Unknown model selected.", "").
    """
    # Feature record in training-column order, with labels converted to codes.
    input_data = dict(
        age=age,
        sex=SEX_MAP[sex_label],
        cp=CP_MAP[cp_label],
        trestbps=trestbps,
        chol=chol,
        fbs=YES_NO_MAP[fbs_label],
        restecg=RESTECG_MAP[restecg_label],
        thalach=thalach,
        exang=YES_NO_MAP[exang_label],
        oldpeak=oldpeak,
        slope=SLOPE_MAP[slope_label],
        ca=ca,
        thal=THAL_MAP[thal_label],
    )

    # (display name, deferred model lookup, needs scaled inputs?). The lookups
    # are wrapped in lambdas so only the selected model's global is read.
    model_table = (
        ("Random Forest", lambda: rf_model, False),
        ("Logistic Regression", lambda: lr_model, True),
        ("Naive Bayes", lambda: nb_model, False),
        ("Decision Tree", lambda: dt_model, False),
    )
    selected = next(
        (entry for entry in model_table if model_name == entry[0]),
        None,
    )
    if selected is None:
        return "Unknown model selected.", ""
    _, fetch_model, use_scaling = selected

    prediction, probability = predict_heart_disease(
        model=fetch_model(),
        input_data=input_data,
        use_scaling=use_scaling
    )

    headline = (
        "Model prediction: Heart disease likely."
        if prediction == 1
        else "Model prediction: Heart disease less likely."
    )
    prediction_text = f"{headline}\nEstimated probability of disease: {probability:.2f}"

    # One bullet per recommendation.
    recs = generate_recommendations(input_data, prediction, probability)
    bullet_lines = [f"- {line}" for line in recs]
    recommendations_text = "\n".join(bullet_lines)

    return prediction_text, recommendations_text


# Build the Gradio interface: a two-column input form, then a model picker
# with a run button, then two text boxes for the results. Components are laid
# out in the order they are created inside each Row/Column context.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="purple")) as demo:
    gr.Markdown("## Heart Disease Risk Assistant")
    gr.Markdown(
        "Fill in the patient information below and choose a model. "
        "The assistant will estimate heart disease risk and suggest lifestyle recommendations."
    )

    with gr.Row():
        # Left column: patient profile
        with gr.Column():
            gr.Markdown("### Patient Profile")
            age = gr.Slider(20, 90, value=55, step=1, label="Age (years)")
            sex_label = gr.Radio(
                choices=["Male", "Female"],
                value="Male",
                label="Sex"
            )
            # Choices come from the mapping keys so every option has a code.
            cp_label = gr.Dropdown(
                choices=list(CP_MAP.keys()),
                value="Atypical angina",
                label="Chest pain type"
            )

            gr.Markdown("### Risk Factors")
            fbs_label = gr.Radio(
                choices=["No", "Yes"],
                value="No",
                label="Fasting blood sugar > 120 mg/dl"
            )
            exang_label = gr.Radio(
                choices=["No", "Yes"],
                value="No",
                label="Exercise induced angina"
            )

        # Right column: clinical measurements
        with gr.Column():
            gr.Markdown("### Clinical Measurements")
            trestbps = gr.Slider(
                80, 200, value=130, step=1,
                label="Resting blood pressure (mm Hg)"
            )
            # NOTE(review): the summary statistics printed earlier show chol up
            # to 564 and oldpeak up to 6.2, beyond these slider maxima — confirm
            # the narrower input ranges are intended.
            chol = gr.Slider(
                100, 400, value=230, step=1,
                label="Cholesterol (mg/dl)"
            )
            thalach = gr.Slider(
                70, 210, value=150, step=1,
                label="Max heart rate achieved"
            )
            oldpeak = gr.Slider(
                0.0, 6.0, value=1.0, step=0.1,
                label="ST depression (oldpeak)"
            )

            restecg_label = gr.Dropdown(
                choices=list(RESTECG_MAP.keys()),
                value="ST-T wave abnormality",
                label="Resting ECG"
            )
            slope_label = gr.Dropdown(
                choices=list(SLOPE_MAP.keys()),
                value="Flat",
                label="Slope of peak exercise ST segment"
            )
            # NOTE(review): the summary statistics printed earlier show ca
            # reaching 4, which this slider cannot select — confirm whether 4
            # is a valid value or a placeholder in the data.
            ca = gr.Slider(
                0, 3, value=0, step=1,
                label="Number of major vessels (0–3)"
            )
            thal_label = gr.Dropdown(
                choices=list(THAL_MAP.keys()),
                value="Reversible defect",
                label="Thalassemia (thal)"
            )

    gr.Markdown("### Model and Results")

    with gr.Row():
        # These names must match the ones ui_predict compares against.
        model_name = gr.Dropdown(
            choices=["Random Forest", "Logistic Regression", "Naive Bayes", "Decision Tree"],
            value="Random Forest",
            label="Model"
        )
        predict_button = gr.Button("Run Prediction")

    with gr.Row():
        prediction_output = gr.Textbox(
            label="Prediction",
            lines=3
        )
        recommendations_output = gr.Textbox(
            label="Recommendations",
            lines=10
        )

    # The inputs are listed in the same order as ui_predict's parameters, and
    # the two outputs match the (prediction_text, recommendations_text) pair
    # it returns.
    predict_button.click(
        fn=ui_predict,
        inputs=[
            age,
            sex_label,
            cp_label,
            trestbps,
            chol,
            fbs_label,
            restecg_label,
            thalach,
            exang_label,
            oldpeak,
            slope_label,
            ca,
            thal_label,
            model_name,
        ],
        outputs=[prediction_output, recommendations_output]
    )

# Start the app. The recorded output shows that in the hosted notebook this
# run was served through a temporary public share link.
demo.launch()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://878b5125a9689f8888.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


