
# Final Heart Disease Prediction Project

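This notebook trains and compares two classifiers (logistic regression and random forest) for heart disease prediction on the provided dataset (1190 rows, loaded from `data/heart.csv`), then exports the better-scoring model.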


In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, roc_curve, confusion_matrix, ConfusionMatrixDisplay)
from sklearn.impute import SimpleImputer
import joblib

# Render every column when a DataFrame is displayed.
pd.options.display.max_columns = None

# Notebook-wide constants: seed for every stochastic step, and the input CSV.
RANDOM_STATE = 42
DATA_PATH = Path("data/heart.csv")

# Read the raw dataset and show its dimensions plus the first rows.
df = pd.read_csv(DATA_PATH)
print("Original shape:", df.shape)
df.head()



## 1) Data Cleaning  
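The raw column names contain spaces and mixed case (for example `chest pain type` and `ST slope`). Rename them to a standardized snake_case format so that later cells can refer to them consistently.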


In [None]:

# Raw CSV header -> standardized snake_case column name.
# Identity entries (e.g. "age") are kept so the values of this mapping also
# serve as the complete list of columns the rest of the notebook relies on.
rename_map = {
    "age": "age",
    "sex": "sex",
    "chest pain type": "chest_pain_type",
    "resting bp s": "resting_bp",
    "cholesterol": "serum_cholesterol",
    "fasting blood sugar": "fasting_blood_sugar",
    "resting ecg": "resting_ecg",
    "max heart rate": "max_heart_rate",
    "exercise angina": "exercise_angina",
    "oldpeak": "oldpeak",
    "ST slope": "st_slope",
    "target": "target"
}
df = df.rename(columns=rename_map)

# DataFrame.rename silently skips keys that are not present, so a CSV with
# different headers would only fail much later with an opaque KeyError.
# Checking the result (instead of passing errors='raise') keeps this cell
# safe to re-run after the columns have already been renamed.
missing_columns = sorted(set(rename_map.values()) - set(df.columns))
if missing_columns:
    raise KeyError(f"Expected columns missing after rename: {missing_columns}")

print("Renamed columns:", df.columns.tolist())
df.head()



## 2) Exploratory Data Analysis (EDA)


In [None]:

# Dataset dimensions.
print("Shape:", df.shape)

# Relative frequency of each target class, rounded to three decimals.
class_share = df['target'].value_counts(normalize=True).round(3)
print("\nClass balance:")
print(class_share)

# Summary statistics of the numeric columns, rendered as a rich table.
summary_stats = df.describe()
display(summary_stats)


In [None]:

# Histogram of patient age (20 bins), labelled through the Axes object
# returned by pandas rather than the pyplot state machine.
age_ax = df['age'].hist(bins=20)
age_ax.set_title("Age Distribution")
age_ax.set_xlabel("Age")
age_ax.set_ylabel("Count")
plt.show()



## 3) Preprocessing & Train-Test Split


In [None]:

# Separate the label column from the predictors.
y = df['target']
X = df.drop(columns=['target'])

# Column groups, using the standardized (renamed) column names.
numeric_features = ['age', 'resting_bp', 'serum_cholesterol', 'max_heart_rate', 'oldpeak']
categorical_features = ['sex', 'chest_pain_type', 'fasting_blood_sugar', 'resting_ecg', 'exercise_angina', 'st_slope']

# Numeric columns: median-impute missing values, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill with the most frequent value, then one-hot encode.
# Categories unseen during fit are ignored at transform time instead of raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer.
preprocess = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# 80/20 hold-out split, stratified on the label and seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y,
)
X_train.shape, X_test.shape



## 4) Train Logistic Regression


In [None]:

# Preprocessing followed by a class-weighted logistic regression.
log_reg = Pipeline([
    ('preprocess', preprocess),
    ('clf', LogisticRegression(max_iter=300, class_weight='balanced', random_state=RANDOM_STATE)),
])
log_reg.fit(X_train, y_train)

# Hard labels and positive-class probabilities on the hold-out set.
y_pred_lr = log_reg.predict(X_test)
y_prob_lr = log_reg.predict_proba(X_test)[:, 1]

# Label-based metrics come from the hard predictions; ROC AUC needs the scores.
label_scorers = (
    ('accuracy', accuracy_score),
    ('precision', precision_score),
    ('recall', recall_score),
    ('f1', f1_score),
)
metrics_lr = {metric: scorer(y_test, y_pred_lr) for metric, scorer in label_scorers}
metrics_lr['roc_auc'] = roc_auc_score(y_test, y_prob_lr)
metrics_lr



## 5) Train RandomForest


In [None]:

# Preprocessing followed by a class-weighted random forest of 300 trees.
# NOTE(review): `preprocess` is the same object passed to the logistic
# regression pipeline; fitting here refits it on the same training data.
rf = Pipeline([
    ('preprocess', preprocess),
    ('clf', RandomForestClassifier(n_estimators=300, class_weight='balanced', random_state=RANDOM_STATE)),
])
rf.fit(X_train, y_train)

# Hard labels and positive-class probabilities on the hold-out set.
y_pred_rf = rf.predict(X_test)
y_prob_rf = rf.predict_proba(X_test)[:, 1]

# Label-based metrics come from the hard predictions; ROC AUC needs the scores.
label_scorers = (
    ('accuracy', accuracy_score),
    ('precision', precision_score),
    ('recall', recall_score),
    ('f1', f1_score),
)
metrics_rf = {metric: scorer(y_test, y_pred_rf) for metric, scorer in label_scorers}
metrics_rf['roc_auc'] = roc_auc_score(y_test, y_prob_rf)
metrics_rf



## 6) Evaluation Plots


In [None]:

# ROC curves for both models on the hold-out set (thresholds are not needed).
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_prob_lr)
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_prob_rf)

# Draw on the current Axes explicitly instead of through pyplot calls.
roc_ax = plt.gca()
roc_ax.plot(fpr_lr, tpr_lr, label=f"LogReg (AUC={metrics_lr['roc_auc']:.3f})")
roc_ax.plot(fpr_rf, tpr_rf, label=f"RF (AUC={metrics_rf['roc_auc']:.3f})")
# Dashed diagonal: the reference line of a random classifier.
roc_ax.plot([0, 1], [0, 1], '--')
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.legend()
plt.show()


In [None]:

# One confusion matrix per model, each drawn in its own figure.
titled_predictions = (
    ("Confusion Matrix - Logistic Regression", y_pred_lr),
    ("Confusion Matrix - RandomForest", y_pred_rf),
)
for cm_title, cm_predictions in titled_predictions:
    cm_display = ConfusionMatrixDisplay.from_predictions(y_test, cm_predictions)
    # Title the matrix axes directly via the display object.
    cm_display.ax_.set_title(cm_title)
    plt.show()



## 7) Export Best Model


In [None]:

# Keep whichever model has the higher hold-out ROC AUC; ties go to the forest.
if metrics_rf['roc_auc'] >= metrics_lr['roc_auc']:
    best_name, best_model = "RandomForest", rf
else:
    best_name, best_model = "LogisticRegression", log_reg
print("Best model:", best_name)

# Persist the full fitted pipeline (preprocessing + classifier) for inference.
MODEL_PATH = Path("data/model.joblib")
joblib.dump(best_model, MODEL_PATH)
print("Saved model to", MODEL_PATH)



## 8) Inference Helper


In [None]:

def predict_one(patient_dict, model=None, threshold=0.5):
    """Score a single patient record with the exported model.

    Parameters
    ----------
    patient_dict : dict
        Mapping of feature name to value for one patient, keyed by the
        standardized (renamed) column names.
    model : object, optional
        Fitted estimator exposing ``predict_proba``. When omitted, the model
        stored at ``MODEL_PATH`` is loaded with joblib on each call; pass an
        already-loaded model to avoid the repeated disk read.
    threshold : float, default 0.5
        Probability at or above which the predicted label is 1.

    Returns
    -------
    tuple of (float, int)
        Positive-class probability and the thresholded 0/1 label.

    Raises
    ------
    ValueError
        If the model records its training feature names and ``patient_dict``
        lacks one or more of them.
    """
    if model is None:
        # joblib.load unpickles the file: only load model files you trust.
        model = joblib.load(MODEL_PATH)

    X_one = pd.DataFrame([patient_dict])

    # Estimators fitted on a DataFrame expose the training column names; use
    # them to report missing inputs clearly and to fix the column order.
    expected = getattr(model, "feature_names_in_", None)
    if expected is not None:
        missing = [name for name in expected if name not in X_one.columns]
        if missing:
            raise ValueError(f"patient_dict is missing required features: {missing}")
        X_one = X_one[list(expected)]

    # Row 0, column 1 = probability of the positive class; cast the numpy
    # scalar to a plain float so the result serializes cleanly.
    prob = float(model.predict_proba(X_one)[0, 1])
    label = int(prob >= threshold)
    return prob, label

# Example usage: one patient record keyed by the standardized feature names.
example = dict(
    age=54,
    sex=1,
    chest_pain_type=4,
    resting_bp=140,
    serum_cholesterol=260,
    fasting_blood_sugar=0,
    resting_ecg=1,
    max_heart_rate=150,
    exercise_angina=1,
    oldpeak=1.2,
    st_slope=2,
)
predict_one(example)
