In [1]:
# ===== Imports =====
import json
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.svm import SVC

# ===== Load data =====
df = pd.read_csv("data/heart_disease_selected.csv")

# Binarize the target: any value above 0 becomes 1, everything else 0.
# A vectorized comparison replaces the per-row Python lambda and yields the
# same 0/1 integer column.
df["target"] = (df["target"] > 0).astype("int64")

X = df.drop(columns="target")
y = df["target"]

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# ratio the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Make sure the output directories exist before anything is written to them.
os.makedirs("models", exist_ok=True)
os.makedirs("results", exist_ok=True)

# ===== Define parameter grids =====
# Maps a display name to the estimator to tune ("model") and the search space
# passed to GridSearchCV ("params").
# NOTE(review): LogisticRegression and SVC are sensitive to feature scale;
# presumably the input CSV is already standardized upstream -- confirm.
param_grids = {
    "LogisticRegression": {
        # liblinear supports both the l1 and l2 penalties searched below.
        "model": LogisticRegression(max_iter=1000, solver="liblinear", random_state=42),
        "params": {
            "C": [0.01, 0.1, 1, 10, 100],
            "penalty": ["l1", "l2"],
        },
    },
    "RandomForest": {
        "model": RandomForestClassifier(random_state=42),
        "params": {
            "n_estimators": [50, 100, 200],
            "max_depth": [None, 5, 10, 20],
            "min_samples_split": [2, 5, 10],
        },
    },
    "SVM": {
        # probability=True enables predict_proba on the fitted classifier.
        "model": SVC(probability=True, random_state=42),
        # A list of grids, one per kernel: the linear kernel ignores gamma,
        # so crossing gamma with it only refits identical models. gamma is
        # therefore searched for the rbf kernel alone.
        "params": [
            {"kernel": ["linear"], "C": [0.1, 1, 10]},
            {"kernel": ["rbf"], "C": [0.1, 1, 10], "gamma": ["scale", "auto"]},
        ],
    },
}

# ===== Hyperparameter Tuning =====
# For every configured estimator: run a 5-fold grid search scored by ROC AUC
# on the training split, evaluate the refit best estimator on the test split,
# persist it, and record its parameters and test AUC.
best_models = {}
results = {}

# Search settings shared by all estimators.
search_options = {"cv": 5, "scoring": "roc_auc", "n_jobs": -1, "verbose": 1}

for model_name, spec in param_grids.items():
    print(f"===== {model_name} =====")
    search = GridSearchCV(spec["model"], spec["params"], **search_options)
    search.fit(X_train, y_train)

    tuned = search.best_estimator_
    chosen_params = search.best_params_

    test_labels = tuned.predict(X_test)
    # Second column of predict_proba is the probability of class 1.
    positive_scores = tuned.predict_proba(X_test)[:, 1]
    test_auc = roc_auc_score(y_test, positive_scores)

    print("Best Params:", chosen_params)
    print("ROC AUC on test set:", test_auc)
    print(classification_report(y_test, test_labels))

    # Persist the tuned estimator under its display name.
    joblib.dump(tuned, f"models/{model_name}_best.pkl")

    best_models[model_name] = tuned
    results[model_name] = {"best_params": chosen_params, "auc": test_auc}

# ===== Save results =====
# Serialize first, then open the file: if a value in `results` cannot be
# encoded as JSON, the error is raised before the output file is truncated,
# so no half-written file is left behind.
results_json = json.dumps(results, indent=4)
with open("results/hyperparameter_tuning.json", "w", encoding="utf-8") as f:
    f.write(results_json)

print("Hyperparameter tuning finished. Best models and results saved.")


===== LogisticRegression =====
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Params: {'C': 1, 'penalty': 'l2'}
ROC AUC on test set: 0.9420995670995671
              precision    recall  f1-score   support

           0       0.90      0.85      0.88        33
           1       0.83      0.89      0.86        28

    accuracy                           0.87        61
   macro avg       0.87      0.87      0.87        61
weighted avg       0.87      0.87      0.87        61

===== RandomForest =====
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best Params: {'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 100}
ROC AUC on test set: 0.9312770562770563
              precision    recall  f1-score   support

           0       0.88      0.85      0.86        33
           1       0.83      0.86      0.84        28

    accuracy                           0.85        61
   macro avg       0.85      0.85      0.85        61
weighted avg       0.85     