In [2]:

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

# Load the dataset (modify the path to your local file)
data = pd.read_csv("heart_failure_clinical_records_dataset.csv")

# Define feature columns and target variable
X = data.drop(columns="DEATH_EVENT")
y = data["DEATH_EVENT"]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create and train machine learning models with hyperparameter tuning

models = {
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "Logistic Regression": LogisticRegression()
}

# Hyperparameter tuning using GridSearchCV
param_grid = {
    "Decision Tree": {"max_depth": [None, 5, 10, 20], "min_samples_split": [2, 5, 10]},
    "Random Forest": {"n_estimators": [100, 200, 300], "max_depth": [None, 10, 20]},
    "SVM": {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"]},
    "Logistic Regression": {"C": [0.1, 1, 10], "penalty": ["l1", "l2"]}
}

results = {}
for model_name, model in models.items():
    grid_search = GridSearchCV(model, param_grid[model_name], cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    # Evaluate the best model with cross-validation
    cv_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='accuracy')
    accuracy = cv_scores.mean()
    precision = cross_val_score(best_model, X_train, y_train, cv=5, scoring=make_scorer(precision_score))
    recall = cross_val_score(best_model, X_train, y_train, cv=5, scoring=make_scorer(recall_score))
    f1 = cross_val_score(best_model, X_train, y_train, cv=5, scoring=make_scorer(f1_score))

    results[model_name] = {
        "Best Parameters": grid_search.best_params_,
        "Accuracy": accuracy,
        "Precision": precision.mean(),
        "Recall": recall.mean(),
        "F1 Score": f1.mean()
    }

# Print the model performance metrics after hyperparameter tuning and cross-validation
for model_name, metrics in results.items():
    print(f"{model_name} Metrics:")
    for metric_name, value in metrics.items():
        if isinstance(value, dict):  # Handling dictionaries within the metrics
            print(f"{metric_name}: {value}")
        else:
            print(f"{metric_name}: {value:.2f}")

Decision Tree Metrics:
Best Parameters: {'max_depth': 5, 'min_samples_split': 2}
Accuracy: 0.81
Precision: 0.72
Recall: 0.63
F1 Score: 0.67
Random Forest Metrics:
Best Parameters: {'max_depth': 20, 'n_estimators': 100}
Accuracy: 0.88
Precision: 0.85
Recall: 0.79
F1 Score: 0.82
SVM Metrics:
Best Parameters: {'C': 1, 'kernel': 'linear'}
Accuracy: 0.82
Precision: 0.72
Recall: 0.65
F1 Score: 0.68
Logistic Regression Metrics:
Best Parameters: {'C': 1, 'penalty': 'l2'}
Accuracy: 0.83
Precision: 0.76
Recall: 0.62
F1 Score: 0.67


15 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

