In [1]:
# Data manipulation and analysis
import numpy as np
import pandas as pd

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Hyperparameter optimization
import optuna

# Utility
import pickle
import warnings

In [12]:
# Single seed reused by the train/test split, the CV folds and the tuned models
random_seed = 42

# Read the cardiovascular dataset (path is relative to the notebook's folder)
df = pd.read_csv(filepath_or_buffer="../data/Cardiovascular_Data.csv")

# Bare last expression -> rich DataFrame preview in the cell output
df

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
69995,19240,2,168,76.0,120,80,1,1,1,0,1,0
69996,22601,1,158,126.0,140,90,2,2,0,0,1,1
69997,19066,2,183,105.0,180,90,3,1,0,1,0,1
69998,22431,1,163,72.0,135,80,1,2,0,0,0,1


In [14]:
# Separate the predictors from the binary target column `cardio`
y = df['cardio']
X = df.drop(columns=["cardio"])

# Hold out 20% of the rows as the test set; the split is seeded for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=random_seed,
)

In [16]:
# Baseline: XGBoost classifier with library-default hyperparameters
model = xgb.XGBClassifier()
model.fit(X_train, y_train)

# Hard class predictions on the held-out test split
y_test_pred = model.predict(X_test)

# Per-class precision / recall / F1 on the test split
print(
    "Classification Report:",
    classification_report(y_test, y_test_pred),
    sep="\n",
)

Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.77      0.74      6988
           1       0.75      0.70      0.73      7012

    accuracy                           0.74     14000
   macro avg       0.74      0.74      0.74     14000
weighted avg       0.74      0.74      0.74     14000



In [17]:
# Optuna objective: mean cross-validated ROC AUC of an XGBoost classifier
def objective(trial):
    """Score one hyperparameter configuration for the Optuna study.

    Draws a configuration from the search space below, fits one
    ``xgb.XGBClassifier`` per stratified fold of ``X_train`` / ``y_train``
    and averages the ROC AUC measured on each fold's validation part.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to draw the hyperparameter values.

    Returns
    -------
    float
        Mean ROC AUC over the 5 stratified folds (higher is better).
    """
    # Settings that are identical for every trial.
    # NOTE(review): 'predictor' is deprecated in newer XGBoost releases in
    # favour of 'device' -- confirm against the installed version.
    fixed_settings = {
        'predictor': 'gpu_predictor',
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'verbosity': 0,
        'enable_categorical': True,
        'random_state': random_seed,
    }

    # Search space. The suggest_* calls are kept in their original order so
    # the sampler is queried in the same sequence.
    tuned_settings = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 3000),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0.0, 5.0),
        'max_delta_step': trial.suggest_int('max_delta_step', 1, 10),
    }

    def _fold_auc(fit_rows, holdout_rows):
        """Fit on one fold's training rows and return the AUC on its validation rows."""
        X_fit, y_fit = X_train.iloc[fit_rows], y_train.iloc[fit_rows]
        X_holdout, y_holdout = X_train.iloc[holdout_rows], y_train.iloc[holdout_rows]

        # NOTE(review): eval_set is supplied but no early stopping is
        # configured here, so it does not shorten training -- confirm intent.
        classifier = xgb.XGBClassifier(**fixed_settings, **tuned_settings)
        classifier.fit(
            X_fit,
            y_fit,
            eval_set=[(X_holdout, y_holdout)],
            verbose=False,
        )

        # Column 1 of predict_proba is the positive-class probability
        positive_proba = classifier.predict_proba(X_holdout)[:, 1]
        return roc_auc_score(y_holdout, positive_proba)

    # Seeded, shuffled 5-fold stratified split of the training data
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_seed)
    fold_aucs = [
        _fold_auc(fit_rows, holdout_rows)
        for fit_rows, holdout_rows in splitter.split(X_train, y_train)
    ]

    # The study maximises this average
    return np.mean(fold_aucs)

# Create an Optuna study for hyperparameter optimization.
# The sampler is seeded with the notebook-wide seed: an unseeded sampler
# proposes a different sequence of trials on every run, so the best trial
# could not be reproduced after a kernel restart.
study = optuna.create_study(
    direction='maximize',  # Maximizing AUC score
    sampler=optuna.samplers.TPESampler(seed=random_seed),
)
study.optimize(objective, n_trials=25, show_progress_bar=True)

# After the optimization, retrieve the best trial and its sampled parameters
best_trial = study.best_trial
print(f"Best trial parameters: {best_trial.params}")

[I 2025-11-11 20:17:48,934] A new study created in memory with name: no-name-9c9adf0e-2939-46ca-a846-29cede71b2c7


  0%|          | 0/25 [00:00<?, ?it/s]

[I 2025-11-11 20:17:53,054] Trial 0 finished with value: 0.8022458140409794 and parameters: {'learning_rate': 0.10763407539125046, 'n_estimators': 471, 'max_depth': 3, 'subsample': 0.5343105035459877, 'colsample_bytree': 0.87465381172171, 'reg_alpha': 8.507803734419277, 'reg_lambda': 6.986173441671124, 'min_child_weight': 6, 'gamma': 1.2216256707968927, 'max_delta_step': 8}. Best is trial 0 with value: 0.8022458140409794.
[I 2025-11-11 20:18:01,039] Trial 1 finished with value: 0.8024392071418414 and parameters: {'learning_rate': 0.11851861055446825, 'n_estimators': 1185, 'max_depth': 12, 'subsample': 0.6414378462710548, 'colsample_bytree': 0.8093361107513153, 'reg_alpha': 2.7818022421601194, 'reg_lambda': 6.812646139636418, 'min_child_weight': 5, 'gamma': 4.531419310724958, 'max_delta_step': 5}. Best is trial 1 with value: 0.8024392071418414.
[I 2025-11-11 20:18:07,749] Trial 2 finished with value: 0.802091059150343 and parameters: {'learning_rate': 0.19317180003014095, 'n_estimators'

In [24]:
# Refit on the full training split with the best hyperparameters found.
# best_trial.params only contains the values drawn via trial.suggest_*, so the
# seed that every cross-validated trial used is passed explicitly to keep the
# final fit consistent with the tuned configuration.
model = xgb.XGBClassifier(**best_trial.params, random_state=random_seed)
model.fit(X_train, y_train)

# Make predictions and evaluate on the held-out test split
y_test_pred = model.predict(X_test)

# Classification Report
print("Classification Report:")
print(classification_report(y_test, y_test_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.78      0.75      6988
           1       0.76      0.70      0.73      7012

    accuracy                           0.74     14000
   macro avg       0.74      0.74      0.74     14000
weighted avg       0.74      0.74      0.74     14000



In [25]:
# Save the trained XGBoost model.
# The context manager closes (and flushes) the file handle even if pickling fails.
with open('cardiovascular_predictor.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [26]:
# Load the trained model.
# pickle.load can execute arbitrary code: only load files you created yourself.
with open('cardiovascular_predictor.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Use the loaded model to make predictions
y_pred = loaded_model.predict(X_test)

# Classification Report for the *loaded* model's predictions, so the
# save/load round trip is actually what gets evaluated
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.78      0.75      6988
           1       0.76      0.70      0.73      7012

    accuracy                           0.74     14000
   macro avg       0.74      0.74      0.74     14000
weighted avg       0.74      0.74      0.74     14000

