In [1]:
import numpy as np # type: ignore
import pandas as pd # type: ignore
import matplotlib.pyplot as plt # type: ignore
import seaborn as sns # type: ignore
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, roc_curve
)
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, silhouette_score

In [2]:
data = pd.read_csv("/home/stranger/Desktop/Heart_Disease_Project/results/cleaned_heart_disease.csv")

In [3]:
data["target"] = (data["target"] > 0).astype(int)
X = data.drop(columns=["target"], axis=1).astype(int)
y = data["target"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [4]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression

param_grid_lr = [
    {'solver': ['lbfgs'], 'penalty': ['l2', None], 'C': [0.01, 0.1, 1, 10]},
    {'solver': ['saga'], 'penalty': ['l1', 'l2', None], 'C': [0.01, 0.1, 1, 10]},
    {'solver': ['saga'], 'penalty': ['elasticnet'], 'C': [0.01, 0.1, 1, 10], 'l1_ratio': [0, 0.25, 0.5, 0.75, 1]}
]

lr = LogisticRegression(max_iter=5000, class_weight="balanced")
grid_lr = GridSearchCV(lr, param_grid_lr, cv=5, scoring="f1", n_jobs=-1, error_score="raise")
grid_lr.fit(X_train, y_train)

print("Best Logistic Regression Params:", grid_lr.best_params_)
print("Best Logistic Regression Score:", grid_lr.best_score_)


from sklearn.ensemble import RandomForestClassifier
import numpy as np

param_dist_rf = {
    "n_estimators": [100, 200, 300, 500],
    "max_depth": [None, 5, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "bootstrap": [True, False]
}

rf = RandomForestClassifier(random_state=42, class_weight="balanced")
rand_rf = RandomizedSearchCV(
    rf, param_distributions=param_dist_rf,
    n_iter=20, cv=5, scoring="f1", random_state=42, n_jobs=-1
)
rand_rf.fit(X_train, y_train)

print("Best Random Forest Params:", rand_rf.best_params_)
print("Best Random Forest Score:", rand_rf.best_score_)


best_lr = grid_lr.best_estimator_
best_rf = rand_rf.best_estimator_

for model, name in [(best_lr, "Logistic Regression"), (best_rf, "Random Forest")]:
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:,1]

    print(f"\n{name} Performance on Test Set:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    print("F1-Score:", f1_score(y_test, y_pred))
    print("AUC:", roc_auc_score(y_test, y_prob))
     



Best Logistic Regression Params: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
Best Logistic Regression Score: 0.810140273163529
Best Random Forest Params: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_depth': 5, 'bootstrap': True}
Best Random Forest Score: 0.8193443142223631

Logistic Regression Performance on Test Set:
Accuracy: 0.7868852459016393
Precision: 0.7142857142857143
Recall: 0.8928571428571429
F1-Score: 0.7936507936507936
AUC: 0.9177489177489178

Random Forest Performance on Test Set:
Accuracy: 0.8688524590163934
Precision: 0.8125
Recall: 0.9285714285714286
F1-Score: 0.8666666666666667
AUC: 0.9556277056277056


In [5]:
import joblib


best_model = rand_rf.best_estimator_


joblib.dump(best_model, "final_model.pkl")
print("✅ Model saved as final_model.pkl")


loaded_model = joblib.load("final_model.pkl")
y_pred_loaded = loaded_model.predict(X_test)

print("Reloaded Model Accuracy:", accuracy_score(y_test, y_pred_loaded))

✅ Model saved as final_model.pkl
Reloaded Model Accuracy: 0.8688524590163934


In [6]:
import joblib

# Save
joblib.dump(model, "final_model.joblib")

# Load
model = joblib.load("final_model.joblib")
