In [1]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# === STEP 1: Load Data ===
# Location of the UCI "processed.cleveland.data" file. The original local path
# is kept as the default so existing runs behave the same; set the
# HEART_DISEASE_DATA environment variable to point at the file on another
# machine without editing the notebook.
file_path = os.environ.get(
    "HEART_DISEASE_DATA",
    r"C:\Users\belss\Downloads\heart+disease\processed.cleveland.data",
)

# Column names from UCI documentation (the file itself has no header row)
columns = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal", "target"
]

# Load dataset. '?' is the file's missing-value marker, so it is parsed as NaN
# at read time; columns whose only non-numeric entries are '?' then load as
# numeric instead of as strings.
df = pd.read_csv(file_path, header=None, names=columns, na_values="?")

# === STEP 2: Clean Data ===
# The cleaned frame is built in a single non-mutating chain (no inplace=True),
# so each stage works on the previous stage's result rather than editing the
# loaded frame piece by piece.
df = (
    df
    # Convert all columns to numeric. Anything unparseable -- including the
    # dataset's '?' placeholders -- becomes NaN, so no separate replace step
    # is needed.
    .apply(pd.to_numeric, errors='coerce')
    # Drop rows with any NaN
    .dropna()
    # Convert target: 0 = no disease, >0 = disease. Vectorized comparison
    # instead of a per-row Python lambda; the result stays an int64 0/1 column.
    .assign(target=lambda frame: frame['target'].gt(0).astype('int64'))
)

# === STEP 3: Split Data ===
# Label is the 'target' column; features are every other column.
y = df['target']
X = df.drop(columns=['target'])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
# NOTE(review): the split is not stratified on y -- confirm whether the class
# ratio should be preserved between train and test.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# === STEP 4: Train Model ===
# Random forest with 100 trees and a fixed seed. fit() returns the fitted
# estimator, so construction and training are chained into one expression.
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# === STEP 5: Evaluate Model ===
# Predict on the held-out rows, then print each metric under its heading.
y_pred = clf.predict(X_test)

sections = (
    ("=== Classification Report ===", classification_report(y_test, y_pred)),
    ("=== Confusion Matrix ===", confusion_matrix(y_test, y_pred)),
)
for heading, body in sections:
    print(heading)
    print(body)

# === STEP 6: Save Model ===
# Keep the file name in one place so the saved path and the printed message
# cannot drift apart.
model_filename = "heart_disease_model.pkl"
joblib.dump(clf, model_filename)
print(f"Model saved as {model_filename}")


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.91      0.89      0.90        36
           1       0.84      0.88      0.86        24

    accuracy                           0.88        60
   macro avg       0.88      0.88      0.88        60
weighted avg       0.88      0.88      0.88        60

=== Confusion Matrix ===
[[32  4]
 [ 3 21]]
Model saved as heart_disease_model.pkl
