In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Read the heart-disease records from the CSV in the working directory.
heart_disease = pd.read_csv(filepath_or_buffer="heart_disease.csv")

# A bare trailing expression makes the notebook render the frame.
heart_disease

In [None]:
# Labels come from the "target" column; the features are every other column.
y = heart_disease["target"]
X = heart_disease.drop(columns=["target"])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical across reruns.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

#  The Baseline: Logistic Regression

In [None]:
# Baseline: logistic regression trained on the unscaled features, with the
# solver's iteration cap set to 1000.
baseline_model = LogisticRegression(max_iter=1000)
baseline_model.fit(X=X_train, y=y_train)

In [None]:
# Score the baseline on the held-out split.
baseline_preds = baseline_model.predict(X_test)
baseline_acc = accuracy_score(y_true=y_test, y_pred=baseline_preds)

print(f"Baseline Accuracy: {baseline_acc * 100:.2f}%")

# Improvement 1: Feature Scaling

In [None]:
# Scale the data.
# The scaler is fitted on the training split only and then applied to the
# test split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Re-run Logistic Regression on scaled data.
# Use the same max_iter as the baseline model so that feature scaling is the
# only difference between the two runs being compared.
scaled_model = LogisticRegression(max_iter=1000)
scaled_model.fit(X_train_scaled, y_train)

# Evaluate on the scaled test split (the model was trained on scaled inputs,
# so it must also be scored on scaled inputs).
scaled_preds = scaled_model.predict(X_test_scaled)
scaled_acc = accuracy_score(y_test, scaled_preds)

print(f"Scaled Model Accuracy: {scaled_acc * 100:.2f}%")

# Improvement 2: Ensemble Methods (Random Forest)

In [None]:
# Fit a 100-tree random forest on the unscaled training features; the seed
# makes the fitted forest reproducible across reruns.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Accuracy on the held-out split.
rf_preds = rf_model.predict(X_test)
rf_acc = accuracy_score(y_true=y_test, y_pred=rf_preds)

print(f"Random Forest Accuracy: {rf_acc * 100:.2f}%")

# Improvement 3: Hyperparameter Tuning

In [None]:
# Define a grid of hyperparameters to search
grid = {
    "n_estimators": [10, 100, 200, 500],
    "max_depth": [None, 5, 10, 20],
    "min_samples_split": [2, 4, 6],
    "min_samples_leaf": [1, 2, 4]
}

# Build the randomized search over the grid.
# The grid holds 4 * 4 * 3 * 3 = 144 combinations; n_iter=20 samples a subset
# of them, and each candidate is scored with 5-fold cross-validation on the
# training split only, so the test split stays untouched until evaluation.
# random_state is fixed on both the forest and the search so reruns pick the
# same candidates and grow the same trees. refit is left at its default
# (True), which is what lets rs_rf.predict / predict_proba / classes_ be used
# directly after fitting.
rs_rf = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=grid,
    n_iter=20,
    cv=5,
    random_state=42,
    n_jobs=-1,  # run the cross-validation fits in parallel on all cores
)

# Fit the tuned model
rs_rf.fit(X_train, y_train)

# Evaluate the best version of the model
tuned_preds = rs_rf.predict(X_test)
tuned_acc = accuracy_score(y_test, tuned_preds)

print(f"Tuned Random Forest Accuracy: {tuned_acc * 100:.2f}%")
print(f"Best Parameters: {rs_rf.best_params_}")

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Predictions from the tuned search object, tabulated against the true labels.
y_preds = rs_rf.predict(X_test)
conf_mat = confusion_matrix(y_true=y_test, y_pred=y_preds)

# Draw the matrix on an explicit 6x6-inch axes, labelled with the classes
# the fitted search object reports.
fig, ax = plt.subplots(figsize=(6, 6))
disp = ConfusionMatrixDisplay(
    confusion_matrix=conf_mat,
    display_labels=rs_rf.classes_,
)
disp.plot(ax=ax, cmap='Blues', colorbar=False)

ax.set_title("Confusion Matrix: Tuned Random Forest")
plt.show()

# Comparing Models with ROC Curves

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# For each fitted model, keep column 1 of predict_proba on the test split.
# NOTE(review): column 1 is treated as the positive class — this assumes a
# binary target whose second entry in classes_ is the positive label; confirm.
baseline_probs, rf_probs, tuned_rf_probs = (
    fitted.predict_proba(X_test)[:, 1]
    for fitted in (baseline_model, rf_model, rs_rf)
)