In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# --- 1. Load the Dataset ---

In [2]:
# Read the data that is inside of the CSV
df = pd.read_csv("./Health_Data/cleaned_health.csv")
df

Unnamed: 0,age,sex,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,cp_0,cp_1,cp_2,cp_3
0,52,1,125,212,0,1,168,0,1.0,2,2,3,0,1,0,0,0
1,53,1,140,203,1,0,155,1,3.1,0,0,3,0,1,0,0,0
2,70,1,145,174,0,1,125,1,2.6,0,0,3,0,1,0,0,0
3,61,1,148,203,0,1,161,0,0.0,2,1,3,0,1,0,0,0
4,62,0,138,294,1,1,106,0,1.9,1,3,2,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297,68,0,120,211,0,0,115,0,1.5,1,0,2,1,0,0,1,0
298,44,0,108,141,0,1,175,0,0.6,1,0,2,1,0,0,1,0
299,52,1,128,255,0,1,161,1,0.0,2,1,3,0,1,0,0,0
300,59,1,160,273,0,0,125,0,0.0,2,0,2,0,0,0,0,1


In [3]:
# Define features (X) and target (y) columns
FEATURES = [
    'age', 'sex', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
    'exang', 'oldpeak', 'slope', 'ca', 'thal',
    'cp_0', 'cp_1', 'cp_2', 'cp_3'
]
TARGET_COL = 'target' # The column indicating disease presence (0 or 1)

# Define numerical columns for imputation and scaling
NUMERICAL_COLS_FOR_PROCESSING = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# --- 2. Data Preprocessing ---

In [4]:
print("\n--- Data Preprocessing for Hyperparameter Tuning ---")

# Handle '?' or other non-numeric values if they exist, converting to NaN first
df.replace('?', np.nan, inplace=True)
df.replace('N/A', np.nan, inplace=True)

# Convert all relevant columns to numeric, coercing errors
all_relevant_cols = FEATURES + [TARGET_COL]
for col in all_relevant_cols:
    if col in df.columns:
        if col in NUMERICAL_COLS_FOR_PROCESSING:
            df[col] = pd.to_numeric(df[col], errors='coerce')
        else: # Categorical/binary features including target
            df[col] = pd.to_numeric(df[col], errors='coerce').astype('Int64') # Use nullable integer

# Impute missing values after type conversion
print("Missing values before imputation:")
print(df[all_relevant_cols].isnull().sum()[df[all_relevant_cols].isnull().sum() > 0])

for col in all_relevant_cols:
    if col in df.columns and df[col].isnull().any():
        if col in NUMERICAL_COLS_FOR_PROCESSING:
            median_val = df[col].median()
            df[col].fillna(median_val, inplace=True)
            print(f"Filled missing values in '{col}' with its median ({median_val}).")
        else: # Categorical/binary features including target
            mode_val = df[col].mode()[0]
            df[col].fillna(mode_val, inplace=True)
            print(f"Filled missing values in '{col}' with its mode ({mode_val}).")

print("\nMissing values after imputation:")
print(df[all_relevant_cols].isnull().sum())

# Separate features (X) and target (y)
X = df[FEATURES]
y = df[TARGET_COL]

# Check if target variable has only two unique values (binary classification)
if y.nunique() != 2:
    print(f"Error: The target column '{TARGET_COL}' is not binary. It has {y.nunique()} unique values: {y.unique()}")
    print("Please ensure your 'target' column is binary (e.g., 0 and 1) for classification.")
    exit()

print(f"\nFeatures (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")
print(f"Target distribution:\n{y.value_counts(normalize=True).round(2)}")


--- Data Preprocessing for Hyperparameter Tuning ---
Missing values before imputation:
Series([], dtype: int64)

Missing values after imputation:
age         0
sex         0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
cp_0        0
cp_1        0
cp_2        0
cp_3        0
target      0
dtype: int64

Features (X) shape: (302, 16)
Target (y) shape: (302,)
Target distribution:
1    0.54
0    0.46
Name: target, dtype: Float64


# --- 3. Split Data into Training and Testing Sets ---

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)
print(f"\nTraining set shape: {X_train.shape}, {y_train.shape}")
print(f"Testing set shape: {X_test.shape}, {y_test.shape}")


Training set shape: (226, 16), (226,)
Testing set shape: (76, 16), (76,)


# --- 4. Feature Scaling (for numerical features) ---

In [6]:
scaler = StandardScaler()
numerical_features_in_X = [col for col in NUMERICAL_COLS_FOR_PROCESSING if col in X_train.columns]

if numerical_features_in_X:
    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()
    X_train_scaled[numerical_features_in_X] = scaler.fit_transform(X_train[numerical_features_in_X])
    X_test_scaled[numerical_features_in_X] = scaler.transform(X_test[numerical_features_in_X])
    print(f"Scaled numerical features: {numerical_features_in_X}")
else:
    print("No numerical features found for scaling. Using original X_train/X_test.")
    X_train_scaled = X_train
    X_test_scaled = X_test

print("\nFirst 5 rows of scaled training features:")
print(X_train_scaled.head())

Scaled numerical features: ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

First 5 rows of scaled training features:
         age  sex  trestbps      chol  fbs  restecg   thalach  exang  \
18  0.410557    1  0.515901 -0.676764    1        0  0.611286      0   
90 -0.246722    1 -0.804933 -1.153240    0        0  1.717907      0   
75 -1.123095    1 -0.684857 -0.505233    0        1  0.832610      0   
8  -0.904002    1 -0.684857  0.047479    0        0 -0.318276      0   
28  0.081918    0  2.917417  1.534084    0        2 -1.513427      1   

     oldpeak  slope  ca  thal  cp_0  cp_1  cp_2  cp_3  
18 -0.851400      2   0     2     0     0     1     0  
90 -0.851400      1   0     1     0     0     0     1  
75 -0.851400      2   0     2     0     1     0     0  
8  -0.149150      2   0     3     1     0     0     0  
28  2.133161      1   0     2     1     0     0     0  


# --- 5. Hyperparameter Tuning using GridSearchCV ---

# --- 5.1. Logistic Regression Hyperparameter Tuning ---

In [7]:
print("\n--- Hyperparameter Tuning for Logistic Regression ---")

# Define the parameter grid to search
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10, 100], # Inverse of regularization strength
    'solver': ['liblinear', 'lbfgs'], # Algorithm to use in the optimization problem
    'penalty': ['l1', 'l2'] # Specify the norm of the penalty
}

# Ensure 'l1' penalty is used only with 'liblinear' solver
# 'lbfgs' only supports 'l2' penalty
# We will filter combinations if needed or use a more robust grid
# For simplicity, we'll let GridSearchCV handle invalid combinations or specify compatible ones.
# A common practice is to use 'liblinear' for l1 and 'lbfgs' for l2.

# Create a Logistic Regression model instance
log_reg = LogisticRegression(random_state=42, max_iter=1000) # Increased max_iter for convergence

# Create GridSearchCV object
grid_search_lr = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid_lr,
    cv=5, # 5-fold cross-validation
    scoring='roc_auc', # Metric to optimize (ROC AUC is good for imbalanced classification)
    n_jobs=-1, # Use all available CPU cores
    verbose=1 # Print progress messages
)

# Fit GridSearchCV to the training data
grid_search_lr.fit(X_train_scaled, y_train)

print("\nBest parameters for Logistic Regression:", grid_search_lr.best_params_)
print("Best ROC AUC score for Logistic Regression:", grid_search_lr.best_score_)

# Get the best Logistic Regression model
best_log_reg_model = grid_search_lr.best_estimator_

# Evaluate the best Logistic Regression model on the test set
y_pred_best_lr = best_log_reg_model.predict(X_test_scaled)
y_proba_best_lr = best_log_reg_model.predict_proba(X_test_scaled)[:, 1]

print("\nBest Logistic Regression Model Performance on Test Set:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_best_lr):.4f}")
print(f"Precision: {precision_score(y_test, y_pred_best_lr):.4f}")
print(f"Recall: {recall_score(y_test, y_pred_best_lr):.4f}")
print(f"F1-Score: {f1_score(y_test, y_pred_best_lr):.4f}")
print(f"ROC AUC Score: {roc_auc_score(y_test, y_proba_best_lr):.4f}")
print("\nConfusion Matrix (Best Logistic Regression):")
print(confusion_matrix(y_test, y_pred_best_lr))
print("\nClassification Report (Best Logistic Regression):")
print(classification_report(y_test, y_pred_best_lr))


--- Hyperparameter Tuning for Logistic Regression ---
Fitting 5 folds for each of 20 candidates, totalling 100 fits

Best parameters for Logistic Regression: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
Best ROC AUC score for Logistic Regression: 0.9086571428571428

Best Logistic Regression Model Performance on Test Set:
Accuracy: 0.8289
Precision: 0.8333
Recall: 0.8537
F1-Score: 0.8434
ROC AUC Score: 0.8885

Confusion Matrix (Best Logistic Regression):
[[28  7]
 [ 6 35]]

Classification Report (Best Logistic Regression):
              precision    recall  f1-score   support

         0.0       0.82      0.80      0.81        35
         1.0       0.83      0.85      0.84        41

    accuracy                           0.83        76
   macro avg       0.83      0.83      0.83        76
weighted avg       0.83      0.83      0.83        76



25 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\lualg\anaconda3\envs\streamlit_pycaret\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\lualg\anaconda3\envs\streamlit_pycaret\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\lualg\anaconda3\envs\streamlit_pycaret\Lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solve

# --- 5.2. Random Forest Classifier Hyperparameter Tuning ---

In [8]:
print("\n--- Hyperparameter Tuning for Random Forest Classifier ---")

# Define the parameter grid to search for Random Forest
param_grid_rf = {
    'n_estimators': [50, 100, 200], # Number of trees in the forest
    'max_depth': [None, 10, 20], # Maximum depth of the tree
    'min_samples_split': [2, 5, 10], # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4], # Minimum number of samples required to be at a leaf node
    'criterion': ['gini', 'entropy'] # Function to measure the quality of a split
}

# Create a Random Forest Classifier model instance
rf_clf = RandomForestClassifier(random_state=42, class_weight='balanced') # class_weight for imbalance

# Create GridSearchCV object
grid_search_rf = GridSearchCV(
    estimator=rf_clf,
    param_grid=param_grid_rf,
    cv=5, # 5-fold cross-validation
    scoring='roc_auc', # Metric to optimize
    n_jobs=-1, # Use all available CPU cores
    verbose=1 # Print progress messages
)

# Fit GridSearchCV to the training data
grid_search_rf.fit(X_train_scaled, y_train)

print("\nBest parameters for Random Forest Classifier:", grid_search_rf.best_params_)
print("Best ROC AUC score for Random Forest Classifier:", grid_search_rf.best_score_)

# Get the best Random Forest model
best_rf_model = grid_search_rf.best_estimator_

# Evaluate the best Random Forest model on the test set
y_pred_best_rf = best_rf_model.predict(X_test_scaled)
y_proba_best_rf = best_rf_model.predict_proba(X_test_scaled)[:, 1]

print("\nBest Random Forest Classifier Model Performance on Test Set:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_best_rf):.4f}")
print(f"Precision: {precision_score(y_test, y_pred_best_rf):.4f}")
print(f"Recall: {recall_score(y_test, y_pred_best_rf):.4f}")
print(f"F1-Score: {f1_score(y_test, y_pred_best_rf):.4f}")
print(f"ROC AUC Score: {roc_auc_score(y_test, y_proba_best_rf):.4f}")
print("\nConfusion Matrix (Best Random Forest):")
print(confusion_matrix(y_test, y_pred_best_rf))
print("\nClassification Report (Best Random Forest):")
print(classification_report(y_test, y_pred_best_rf))


--- Hyperparameter Tuning for Random Forest Classifier ---
Fitting 5 folds for each of 162 candidates, totalling 810 fits

Best parameters for Random Forest Classifier: {'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 50}
Best ROC AUC score for Random Forest Classifier: 0.9181396825396826

Best Random Forest Classifier Model Performance on Test Set:
Accuracy: 0.7895
Precision: 0.7907
Recall: 0.8293
F1-Score: 0.8095
ROC AUC Score: 0.8739

Confusion Matrix (Best Random Forest):
[[26  9]
 [ 7 34]]

Classification Report (Best Random Forest):
              precision    recall  f1-score   support

         0.0       0.79      0.74      0.76        35
         1.0       0.79      0.83      0.81        41

    accuracy                           0.79        76
   macro avg       0.79      0.79      0.79        76
weighted avg       0.79      0.79      0.79        76



In [9]:
print("\nHyperparameter tuning complete. Best models and their performance on the test set are displayed.")


Hyperparameter tuning complete. Best models and their performance on the test set are displayed.
