In [1]:
# Step 1: Imports
import pandas as pd
import numpy as np
import joblib

from ucimlrepo import fetch_ucirepo

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

In [2]:
# Step 2: Load raw dataset
heart_disease = fetch_ucirepo(id=45)
X = heart_disease.data.features
y = (heart_disease.data.targets > 0).astype(int).values.ravel()

print("✅ Raw dataset loaded:", X.shape)

✅ Raw dataset loaded: (303, 13)


In [3]:
# Step 3: Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("✅ Train-test split done:", X_train.shape, X_test.shape)

✅ Train-test split done: (242, 13) (61, 13)


In [4]:
# Step 4: Define columns
categorical_cols = ["sex", "cp", "fbs", "restecg", "exang", "slope", "thal"]
numeric_cols = [col for col in X.columns if col not in categorical_cols]

print("Categorical:", categorical_cols)
print("Numeric:", numeric_cols)

Categorical: ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']
Numeric: ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'ca']


In [5]:
# Step 5: Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ("num", Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler())
        ]), numeric_cols),

        ("cat", Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("encoder", OneHotEncoder(drop="first", handle_unknown="ignore"))
        ]), categorical_cols)
    ]
)

In [6]:
# Step 6: Define pipeline (preprocessing + model)
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=200, random_state=42))
])

In [7]:
# Step 7: Train pipeline
pipeline.fit(X_train, y_train)
print("✅ Pipeline trained successfully")

✅ Pipeline trained successfully


In [8]:
# Step 8: Evaluate on test set
y_pred = pipeline.predict(X_test)
y_proba = pipeline.predict_proba(X_test)[:, 1]

acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)

print("📊 Test Set Evaluation:")
print(f"Accuracy: {acc:.3f}")
print(f"Precision: {prec:.3f}")
print(f"Recall: {rec:.3f}")
print(f"F1-score: {f1:.3f}")
print(f"AUC: {auc:.3f}")


📊 Test Set Evaluation:
Accuracy: 0.885
Precision: 0.839
Recall: 0.929
F1-score: 0.881
AUC: 0.957


In [9]:
# Step 9: Save pipeline
joblib.dump(pipeline, "heart_disease_pipeline.pkl")
print("💾 Pipeline saved as heart_disease_pipeline.pkl")

💾 Pipeline saved as heart_disease_pipeline.pkl


In [10]:
# Step 10: Inspect misclassifications
comparison = pd.DataFrame({
    "True Label": y_test,
    "Predicted": y_pred,
    "Prob (Disease)": y_proba.round(3)
})
print("\n🔎 Sample comparison:")
print(comparison.head(15))


🔎 Sample comparison:
    True Label  Predicted  Prob (Disease)
0            0          0           0.305
1            0          1           0.575
2            0          0           0.045
3            0          0           0.035
4            0          0           0.295
5            0          0           0.270
6            0          0           0.335
7            0          0           0.350
8            1          1           0.650
9            0          0           0.175
10           1          1           0.780
11           0          0           0.215
12           0          0           0.010
13           1          1           0.635
14           1          1           0.900


# Random samples to test after deployment 


In [15]:
i = 5  # choose the test sample index
row = X_test.iloc[i]
true_label = y_test[i]   # no .iloc here

print("🔎 Patient features:")
print(row)
print("\nTrue label (0=healthy, 1=diseased):", true_label)

# Convert row to dict for Streamlit
row_dict = row.to_dict()
print("\nPaste this into Streamlit form:\n", row_dict)

🔎 Patient features:
age          60.0
sex           0.0
cp            3.0
trestbps    120.0
chol        178.0
fbs           1.0
restecg       0.0
thalach      96.0
exang         0.0
oldpeak       0.0
slope         1.0
ca            0.0
thal          3.0
Name: 244, dtype: float64

True label (0=healthy, 1=diseased): 0

Paste this into Streamlit form:
 {'age': 60.0, 'sex': 0.0, 'cp': 3.0, 'trestbps': 120.0, 'chol': 178.0, 'fbs': 1.0, 'restecg': 0.0, 'thalach': 96.0, 'exang': 0.0, 'oldpeak': 0.0, 'slope': 1.0, 'ca': 0.0, 'thal': 3.0}
