In [8]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import randint

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve

In [2]:
import os

# Location of the cleaned heart-disease CSV. Set the HEART_DISEASE_CSV
# environment variable to load the file from another location; when it is
# unset, the original machine-specific path is used unchanged.
DATA_PATH = os.environ.get(
    "HEART_DISEASE_CSV",
    r"C:\Users\DELL\Desktop\Depi\Heart_Disease_Project\data\heart_disease_clean.csv",
)
df = pd.read_csv(DATA_PATH)

In [3]:
from sklearn.model_selection import train_test_split

# Target is the "num" column; every other column of df is used as a feature.
y = df["num"]
X = df.drop(columns="num")

# Hold out 20% of the rows for testing. The split is stratified on the target
# and seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)


In [4]:
# Candidate hyperparameter values for the grid search, keyed by the same
# display names used for the entries of base_models.
param_grids = {
    "Logistic Regression": dict(
        C=[0.01, 0.1, 1, 10, 100],
        solver=["liblinear", "lbfgs"],
    ),
    "Decision Tree": dict(
        max_depth=[None, 5, 10, 20],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
    ),
    "Random Forest": dict(
        n_estimators=[50, 100, 200],
        max_depth=[None, 10, 20],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
    ),
    "SVM": dict(
        C=[0.1, 1, 10],
        kernel=["linear", "rbf"],
        gamma=["scale", "auto"],
    ),
}

In [7]:
# Result containers filled in by the later cells: the tuned estimator and the
# before/after test accuracy for each model name.
baseline_scores, tuned_scores, best_models = {}, {}, {}

# Untuned estimators, all seeded with the same random_state.
base_models = dict([
    ("Logistic Regression", LogisticRegression(random_state=42, max_iter=500)),
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("Random Forest", RandomForestClassifier(random_state=42)),
    ("SVM", SVC(random_state=42, probability=True)),
])

In [9]:
# Baseline: fit each untuned model on the training split and record its
# accuracy on the test split.
for name, model in base_models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)
    baseline_scores[name] = accuracy_score(y_test, y_pred)

In [10]:
# Settings shared by every grid search: accuracy scoring, 5-fold CV, parallel jobs.
search_options = dict(scoring='accuracy', cv=5, n_jobs=-1, verbose=1)

for name, model in base_models.items():
    # Search this model's grid on the training split only.
    grid_search = GridSearchCV(model, param_grids[name], **search_options)
    grid_search.fit(X_train, y_train)

    # Keep the winning estimator and score it on the test split.
    tuned_model = grid_search.best_estimator_
    best_models[name] = tuned_model

    y_pred = tuned_model.predict(X_test)
    tuned_scores[name] = accuracy_score(y_test, y_pred)

    print(f"Best Params for {name}: {grid_search.best_params_}")
    print(f"Baseline Accuracy: {baseline_scores[name]:.4f}  Tuned Accuracy: {tuned_scores[name]:.4f}\n")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Params for Logistic Regression: {'C': 0.1, 'solver': 'lbfgs'}
Baseline Accuracy: 0.6066  Tuned Accuracy: 0.6066

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best Params for Decision Tree: {'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 10}
Baseline Accuracy: 0.4590  Tuned Accuracy: 0.4918

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best Params for Random Forest: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 50}
Baseline Accuracy: 0.5410  Tuned Accuracy: 0.5410

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Params for SVM: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Baseline Accuracy: 0.5574  Tuned Accuracy: 0.5410



In [11]:
# Sampling space for the Random Forest random search. scipy's randint(low, high)
# draws integers from low up to high - 1 (the upper bound is excluded).
rand_params = dict(
    n_estimators=randint(50, 300),
    max_depth=[None, 10, 20, 30],
    min_samples_split=randint(2, 10),
    min_samples_leaf=randint(1, 5),
)

# 20 sampled candidates, each scored by 5-fold cross-validated accuracy.
rand_search = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=rand_params,
    n_iter=20,
    scoring="accuracy",
    cv=5,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
rand_search.fit(X_train, y_train)

# The accuracy printed below is measured on the test split, not the CV score.
rand_test_pred = rand_search.best_estimator_.predict(X_test)
print("RandomizedSearchCV Best Params (Random Forest):", rand_search.best_params_)
print("RandomizedSearchCV Best Accuracy:", accuracy_score(y_test, rand_test_pred))

Fitting 5 folds for each of 20 candidates, totalling 100 fits
RandomizedSearchCV Best Params (Random Forest): {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 9, 'n_estimators': 269}
RandomizedSearchCV Best Accuracy: 0.5573770491803278


In [12]:
import joblib
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Pipeline.fit trains its steps in place rather than cloning them. Wrap an
# unfitted clone (same hyperparameters) so the tuned estimator stored in
# best_models keeps the fit that produced its reported accuracy.
best_model = clone(best_models["Logistic Regression"])

# NOTE(review): the grid search fitted on X_train directly, without scaling,
# while this pipeline standardizes the features first. The selected
# hyperparameters were therefore chosen under different preprocessing;
# confirm this is intended.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', best_model)
])

# Fit scaler + model on the training split (the test split stays held out).
pipeline.fit(X_train, y_train)

# Persist the fitted pipeline as a .pkl file in the current working directory.
joblib.dump(pipeline, "best_model.pkl")
print("Model saved as best_model.pkl")

Model saved as best_model.pkl
