In [1]:
# Optuna tuning notebook (conservative setting)
# - Tuning XGBoost and TabNet with Optuna
# - Conservative search (A): n_trials = 30
# - Follow your pipeline: split 70/15/15, StandardScaler fit on train, SMOTE+Tomek on train only
# - Final evaluation uses validation to find threshold and test set for final metrics

# Note: This is a single-file script intended to be run as a Jupyter cell-by-cell notebook.
# If you want .ipynb, copy this code into a new notebook cell blocks.


In [2]:
import os
import time
import json
import numpy as np
import pandas as pd
import torch
import optuna
import xgboost as xgb
from xgboost import XGBClassifier
from pytorch_tabnet.tab_model import TabNetClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    classification_report, roc_auc_score, f1_score, accuracy_score,
    precision_score, recall_score, confusion_matrix
)
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from imblearn.combine import SMOTETomek
from sklearn.utils import check_random_state
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# === USER EDITABLE CONFIG ===
DATA_PATH = "data/diabetes_binary.csv"  # <-- sesuaikan
SEED = 42
N_TRIALS = 30  # conservative (A)
N_FOLDS = 3
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'  # TabNet device; XGBoost stays CPU

# Reproducibility
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

print(f"PyTorch Version : {torch.__version__}")
print("Using Optuna for Bayesian-style search (TPE) | Conservative setting (n_trials=30)")

if torch.cuda.is_available():
    print(f"TabNet device set to GPU: {torch.cuda.get_device_name(0)}")
else:
    print("CUDA not available; TabNet will run on CPU (XGBoost stays on CPU).")


PyTorch Version : 2.9.1+cu130
Using Optuna for Bayesian-style search (TPE) | Conservative setting (n_trials=30)
TabNet device set to GPU: NVIDIA GeForce RTX 5060 Laptop GPU


In [4]:
# === LOAD DATA ===
print("Loading data...")
df = pd.read_csv(DATA_PATH)
X = df.iloc[:, 1:22].values
y = df.iloc[:, 0].values


Loading data...


In [5]:
# === SPLIT 70/15/15 ===
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=SEED
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=SEED
)

# === SCALING ===
scaler = StandardScaler()
X_train_scaled_before_balance = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# === BALANCE (SMOTE + TOMEK) on train only ===
print(f"Original Train Shape: {X_train.shape}")
print(f"Original Class dist : {np.bincount(y_train.astype(int))}")

smt = SMOTETomek(random_state=SEED, smote=SMOTE(random_state=SEED), tomek=TomekLinks())
X_train_bal, y_train_bal = smt.fit_resample(X_train_scaled_before_balance, y_train)

print(f"SMOTE+Tomek Train Shape   : {X_train_bal.shape}")
print(f"SMOTE+Tomek Class dist    : {np.bincount(y_train_bal.astype(int))}")

# Single stratified split of the balanced train for tuning (train/valid)
X_train_tune, X_val_tune, y_train_tune, y_val_tune = train_test_split(
    X_train_bal, y_train_bal, test_size=0.20, stratify=y_train_bal, random_state=SEED
)

# Utility: threshold search on validation
def find_best_threshold(y_val, y_prob_val):
    thresholds = np.arange(0.1, 0.9, 0.01)
    f1s = [f1_score(y_val, (y_prob_val > th).astype(int)) for th in thresholds]
    best_th = thresholds[np.argmax(f1s)]
    return best_th

# Metrics container
from collections import OrderedDict

def calculate_metrics(y_true, y_pred, y_prob):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    acc = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    auc = roc_auc_score(y_true, y_prob)
    sensitivity = recall
    specificity = tn / (tn + fp) if (tn + fp) != 0 else 0
    ppv = tp / (tp + fp) if (tp + fp) != 0 else 0
    npv = tn / (tn + fn) if (tn + fn) != 0 else 0
    return OrderedDict({
        'Accuracy': acc,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'ROC-AUC': auc,
        'Sensitivity': sensitivity,
        'Specificity': specificity,
        'PPV': ppv,
        'NPV': npv
    })


Original Train Shape: (177576, 21)
Original Class dist : [152834  24742]
SMOTE+Tomek Train Shape   : (304175, 21)
SMOTE+Tomek Class dist    : [152834 151341]


In [6]:
# === OPTUNA OBJECTIVE FOR XGBOOST ===

def objective_xgb(trial):
    # Suggest hyperparameters
    param = {
        'n_estimators': trial.suggest_categorical('n_estimators', [100, 200, 300]),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0),
        'objective': 'binary:logistic',
        'tree_method': 'hist',
        'random_state': SEED,
        'n_jobs': 1  # limit inside trials to avoid oversubscription
    }

    clf = XGBClassifier(**param)
    clf.fit(X_train_tune, y_train_tune, verbose=False)
    y_pred = clf.predict(X_val_tune)
    return float(f1_score(y_val_tune, y_pred))


In [7]:
# === OPTUNA OBJECTIVE FOR TABNET ===

def objective_tabnet(trial):
    # Suggest hyperparameters (conservative ranges)
    n_d = trial.suggest_categorical('n_d', [8, 16, 32])
    n_a = trial.suggest_categorical('n_a', [8, 16, 32])
    n_steps = trial.suggest_int('n_steps', 3, 8)
    gamma = trial.suggest_float('gamma', 1.0, 2.0)
    lambda_sparse = trial.suggest_loguniform('lambda_sparse', 1e-4, 1e-2)
    lr = trial.suggest_loguniform('lr', 1e-3, 1e-1)
    batch_size = trial.suggest_categorical('batch_size', [128, 256, 512])

    max_epochs = 50
    patience = 10

    clf = TabNetClassifier(
        n_d=n_d, n_a=n_a, n_steps=n_steps,
        gamma=gamma,
        lambda_sparse=lambda_sparse,
        optimizer_fn=torch.optim.Adam,
        optimizer_params=dict(lr=lr),
        scheduler_fn=torch.optim.lr_scheduler.StepLR,
        scheduler_params={"step_size": 10, "gamma": 0.5},
        mask_type='entmax',
        device_name=DEVICE,
        seed=SEED,
        verbose=0
    )

    clf.fit(
        X_train=X_train_tune, y_train=y_train_tune,
        eval_set=[(X_val_tune, y_val_tune)],
        eval_name=['valid'],
        eval_metric=['auc'],
        max_epochs=max_epochs,
        patience=patience,
        batch_size=batch_size,
        virtual_batch_size=min(64, batch_size),
        num_workers=0,
        drop_last=False
    )

    y_prob = clf.predict_proba(X_val_tune)[:, 1]
    y_pred = (y_prob > 0.5).astype(int)
    score = float(f1_score(y_val_tune, y_pred))

    del clf
    torch.cuda.empty_cache() if torch.cuda.is_available() else None

    return score


In [8]:
# === RUN STUDIES ===

# XGBoost study
print("Starting Optuna study for XGBoost...")
study_xgb = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=SEED))
study_xgb.optimize(objective_xgb, n_trials=N_TRIALS, n_jobs=1)

print("Best XGB trial:", study_xgb.best_trial.params)

# Save XGBoost study results
with open('optuna_xgb_best.json', 'w') as f:
    json.dump(study_xgb.best_trial.params, f, indent=2)


[I 2025-12-10 00:34:23,553] A new study created in memory with name: no-name-97185821-e995-4e47-ae07-c3670365359e


Starting Optuna study for XGBoost...


[I 2025-12-10 00:34:31,047] Trial 0 finished with value: 0.8866379310344827 and parameters: {'n_estimators': 200, 'max_depth': 7, 'learning_rate': 0.015958237752949748, 'subsample': 0.662397808134481, 'colsample_bytree': 0.6232334448672797, 'reg_alpha': 0.8661761457749352, 'reg_lambda': 0.6011150117432088}. Best is trial 0 with value: 0.8866379310344827.
[I 2025-12-10 00:34:39,467] Trial 1 finished with value: 0.9069413314626927 and parameters: {'n_estimators': 300, 'max_depth': 9, 'learning_rate': 0.018891200276189388, 'subsample': 0.6727299868828402, 'colsample_bytree': 0.6733618039413735, 'reg_alpha': 0.3042422429595377, 'reg_lambda': 0.5247564316322378}. Best is trial 1 with value: 0.9069413314626927.
[I 2025-12-10 00:34:44,935] Trial 2 finished with value: 0.8873661100392523 and parameters: {'n_estimators': 300, 'max_depth': 4, 'learning_rate': 0.023993242906812727, 'subsample': 0.7465447373174767, 'colsample_bytree': 0.7824279936868144, 'reg_alpha': 0.7851759613930136, 'reg_lambd

Best XGB trial: {'n_estimators': 200, 'max_depth': 10, 'learning_rate': 0.1179380942098765, 'subsample': 0.9621945045088368, 'colsample_bytree': 0.8900646198797656, 'reg_alpha': 0.5030911215349643, 'reg_lambda': 0.7591065325083481}


In [9]:
# === TRAIN FINAL XGBOOST MODEL AND EVALUATE ===

best_xgb_params = study_xgb.best_trial.params
# map types: ensure required keys exist
xgb_final = XGBClassifier(
    objective='binary:logistic',
    tree_method='hist',
    n_jobs=-1,
    random_state=SEED,
    **best_xgb_params
)

start = time.time()
xgb_final.fit(X_train_bal, y_train_bal)
end = time.time()
print(f"XGBoost final trained in {(end-start)/60:.2f} mins")

# Find best threshold on validation
xgb_prob_val = xgb_final.predict_proba(X_val_scaled)[:, 1]
xgb_best_th = find_best_threshold(y_val, xgb_prob_val)

# Predict on test
y_prob_xgb = xgb_final.predict_proba(X_test_scaled)[:, 1]
y_pred_xgb = (y_prob_xgb > xgb_best_th).astype(int)

xgb_metrics = calculate_metrics(y_test, y_pred_xgb, y_prob_xgb)
print('\n--- XGBoost Metrics ---')
for k, v in xgb_metrics.items():
    print(f"{k:<12}: {v:.4f}")


XGBoost final trained in 0.02 mins

--- XGBoost Metrics ---
Accuracy    : 0.8105
Precision   : 0.3800
Recall      : 0.5696
F1-Score    : 0.4559
ROC-AUC     : 0.8203
Sensitivity : 0.5696
Specificity : 0.8496
PPV         : 0.3800
NPV         : 0.9242


In [None]:
# === RUN TABNET STUDY ===
print("Starting Optuna study for TabNet... (this may take a while)")
study_tab = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=SEED))
study_tab.optimize(objective_tabnet, n_trials=N_TRIALS, n_jobs=1)

print("Best TabNet trial:", study_tab.best_trial.params)

# Save TabNet study results
with open('optuna_tabnet_best.json', 'w') as f:
    json.dump(study_tab.best_trial.params, f, indent=2)


[I 2025-12-10 00:36:53,510] A new study created in memory with name: no-name-f5ac8d32-a71c-461c-8de4-cb9f1df7131f


Starting Optuna study for TabNet... (this may take a while)

Early stopping occurred at epoch 21 with best_epoch = 11 and best_valid_auc = 0.8906


[I 2025-12-10 00:47:02,651] Trial 0 finished with value: 0.8075998774213319 and parameters: {'n_d': 16, 'n_a': 8, 'n_steps': 3, 'gamma': 1.866176145774935, 'lambda_sparse': 0.0015930522616241021, 'lr': 0.02607024758370768, 'batch_size': 256}. Best is trial 0 with value: 0.8075998774213319.



Early stopping occurred at epoch 33 with best_epoch = 23 and best_valid_auc = 0.88324


[I 2025-12-10 01:00:09,453] Trial 1 finished with value: 0.7993415795697894 and parameters: {'n_d': 8, 'n_a': 16, 'n_steps': 4, 'gamma': 1.6118528947223796, 'lambda_sparse': 0.00019010245319870352, 'lr': 0.00383962929980417, 'batch_size': 512}. Best is trial 0 with value: 0.8075998774213319.



Early stopping occurred at epoch 18 with best_epoch = 8 and best_valid_auc = 0.89487


[I 2025-12-10 01:06:05,112] Trial 2 finished with value: 0.7992563810544651 and parameters: {'n_d': 32, 'n_a': 16, 'n_steps': 3, 'gamma': 1.9488855372533331, 'lambda_sparse': 0.00853618986286683, 'lr': 0.041380401125610165, 'batch_size': 512}. Best is trial 0 with value: 0.8075998774213319.



Early stopping occurred at epoch 18 with best_epoch = 8 and best_valid_auc = 0.88773


[I 2025-12-10 01:21:38,758] Trial 3 finished with value: 0.8000579588813936 and parameters: {'n_d': 32, 'n_a': 16, 'n_steps': 6, 'gamma': 1.311711076089411, 'lambda_sparse': 0.001096821720752952, 'lr': 0.0123999678368461, 'batch_size': 256}. Best is trial 0 with value: 0.8075998774213319.



Early stopping occurred at epoch 29 with best_epoch = 19 and best_valid_auc = 0.87219


[I 2025-12-10 01:43:38,647] Trial 4 finished with value: 0.7904834007184163 and parameters: {'n_d': 8, 'n_a': 8, 'n_steps': 3, 'gamma': 1.3253303307632645, 'lambda_sparse': 0.0005989003672254305, 'lr': 0.003488976654890368, 'batch_size': 128}. Best is trial 0 with value: 0.8075998774213319.



Early stopping occurred at epoch 15 with best_epoch = 5 and best_valid_auc = 0.87755


[I 2025-12-10 01:53:05,808] Trial 5 finished with value: 0.7962465238398558 and parameters: {'n_d': 32, 'n_a': 16, 'n_steps': 4, 'gamma': 1.0055221171236024, 'lambda_sparse': 0.004274869455295219, 'lr': 0.025924756604751596, 'batch_size': 256}. Best is trial 0 with value: 0.8075998774213319.



Early stopping occurred at epoch 15 with best_epoch = 5 and best_valid_auc = 0.8651


[I 2025-12-10 02:08:27,931] Trial 6 finished with value: 0.7874587668104542 and parameters: {'n_d': 32, 'n_a': 8, 'n_steps': 4, 'gamma': 1.325183322026747, 'lambda_sparse': 0.002878805718308925, 'lr': 0.018841476921545086, 'batch_size': 128}. Best is trial 0 with value: 0.8075998774213319.


Stop training because you reached max_epochs = 50 with best_epoch = 48 and best_valid_auc = 0.86641


[I 2025-12-10 02:58:38,635] Trial 7 finished with value: 0.7936990765888104 and parameters: {'n_d': 16, 'n_a': 8, 'n_steps': 5, 'gamma': 1.025419126744095, 'lambda_sparse': 0.00016435497475111326, 'lr': 0.001155735281626987, 'batch_size': 128}. Best is trial 0 with value: 0.8075998774213319.



Early stopping occurred at epoch 17 with best_epoch = 7 and best_valid_auc = 0.86873


[I 2025-12-10 03:07:08,747] Trial 8 finished with value: 0.7771093273557383 and parameters: {'n_d': 8, 'n_a': 8, 'n_steps': 4, 'gamma': 1.1612212872540044, 'lambda_sparse': 0.007234279845665418, 'lr': 0.04132765459466366, 'batch_size': 256}. Best is trial 0 with value: 0.8075998774213319.



Early stopping occurred at epoch 18 with best_epoch = 8 and best_valid_auc = 0.88317


[I 2025-12-10 03:18:30,176] Trial 9 finished with value: 0.7949607626953759 and parameters: {'n_d': 16, 'n_a': 16, 'n_steps': 3, 'gamma': 1.2279351625419417, 'lambda_sparse': 0.0007148510793512986, 'lr': 0.04325432427964557, 'batch_size': 128}. Best is trial 0 with value: 0.8075998774213319.



Early stopping occurred at epoch 30 with best_epoch = 20 and best_valid_auc = 0.88693


[I 2025-12-10 03:44:32,821] Trial 10 finished with value: 0.7960036868786622 and parameters: {'n_d': 16, 'n_a': 32, 'n_steps': 8, 'gamma': 1.9541218213108225, 'lambda_sparse': 0.0017556088263664706, 'lr': 0.005762168055965823, 'batch_size': 256}. Best is trial 0 with value: 0.8075998774213319.



Early stopping occurred at epoch 25 with best_epoch = 15 and best_valid_auc = 0.88885


[I 2025-12-10 04:03:46,018] Trial 11 finished with value: 0.8030990415335463 and parameters: {'n_d': 16, 'n_a': 32, 'n_steps': 7, 'gamma': 1.643811744770019, 'lambda_sparse': 0.0013093310334961429, 'lr': 0.012151469945526802, 'batch_size': 256}. Best is trial 0 with value: 0.8075998774213319.



Early stopping occurred at epoch 18 with best_epoch = 8 and best_valid_auc = 0.87497


[I 2025-12-10 04:18:40,567] Trial 12 finished with value: 0.7949600473064152 and parameters: {'n_d': 16, 'n_a': 32, 'n_steps': 7, 'gamma': 1.7162871820090468, 'lambda_sparse': 0.0003419653838206277, 'lr': 0.007827949812496831, 'batch_size': 256}. Best is trial 0 with value: 0.8075998774213319.


In [None]:
# === TRAIN FINAL TABNET MODEL AND EVALUATE ===

best_tab = study_tab.best_trial.params
clf_tabnet_final = TabNetClassifier(
    n_d=best_tab.get('n_d', 8),
    n_a=best_tab.get('n_a', 8),
    n_steps=best_tab.get('n_steps', 3),
    gamma=best_tab.get('gamma', 1.3),
    lambda_sparse=best_tab.get('lambda_sparse', 0.001),
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=best_tab.get('lr', 0.02)),
    scheduler_fn=torch.optim.lr_scheduler.StepLR,
    scheduler_params={"step_size": 10, "gamma": 0.5},
    mask_type='entmax',
    device_name=DEVICE,
    seed=SEED,
    verbose=1
)

start = time.time()
clf_tabnet_final.fit(
    X_train=X_train_bal, y_train=y_train_bal,
    eval_set=[(X_train_bal, y_train_bal), (X_val_scaled, y_val)],
    eval_name=['train', 'valid'],
    eval_metric=['auc', 'accuracy'],
    max_epochs=200,
    patience=20,
    batch_size=best_tab.get('batch_size', 256),
    virtual_batch_size=min(128, best_tab.get('batch_size', 256)),
    num_workers=0,
    drop_last=False
)
end = time.time()
print(f"TabNet final trained in {(end-start)/60:.2f} mins")

# Evaluate TabNet
tab_prob_val = clf_tabnet_final.predict_proba(X_val_scaled)[:, 1]
tab_best_th = find_best_threshold(y_val, tab_prob_val)

y_prob_tab = clf_tabnet_final.predict_proba(X_test_scaled)[:, 1]
y_pred_tab = (y_prob_tab > tab_best_th).astype(int)

tab_metrics = calculate_metrics(y_test, y_pred_tab, y_prob_tab)
print('\n--- TabNet Metrics ---')
for k, v in tab_metrics.items():
    print(f"{k:<12}: {v:.4f}")


In [None]:
# --- Summary DataFrame ---
metrics_df = pd.DataFrame({
    'Model': ['XGBoost', 'TabNet'],
    'Accuracy': [xgb_metrics['Accuracy'], tab_metrics['Accuracy']],
    'Precision': [xgb_metrics['Precision'], tab_metrics['Precision']],
    'Recall': [xgb_metrics['Recall'], tab_metrics['Recall']],
    'F1-Score': [xgb_metrics['F1-Score'], tab_metrics['F1-Score']],
    'ROC-AUC': [xgb_metrics['ROC-AUC'], tab_metrics['ROC-AUC']],
    'Sensitivity': [xgb_metrics['Sensitivity'], tab_metrics['Sensitivity']],
    'Specificity': [xgb_metrics['Specificity'], tab_metrics['Specificity']],
    'PPV': [xgb_metrics['PPV'], tab_metrics['PPV']],
    'NPV': [xgb_metrics['NPV'], tab_metrics['NPV']]
})

print('\n--- Summary ---')
print(metrics_df)

# Save metrics
metrics_df.to_csv('tuning_metrics_summary.csv', index=False)

print('\nSaved: optuna_xgb_best.json, optuna_tabnet_best.json, tuning_metrics_summary.csv')