In [2]:
!pip install optuna



### Imports

In [38]:
import pandas as pd
from collections import Counter
import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score
import numpy as np
from optuna.storages import RDBStorage
import joblib
import json

### Importing the dataset

In [15]:
dataset=pd.read_csv('/kaggle/input/dataset-pred/classification_dataset.csv', sep=',')
X=dataset.drop('truth', axis=1)
Y=dataset['truth']

#showing class distribution
counter=Counter(Y)

print(f"'0' CLASS PERCENTAGE:{counter[0]/(counter[0]+counter[1])*100: 5.1f}%")
print(f"'1' CLASS PERCENTAGE:{counter[1]/(counter[0]+counter[1])*100: 5.1f}%")



X_Train, X_test, Y_Train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42, stratify=Y)

'0' CLASS PERCENTAGE: 24.2%
'1' CLASS PERCENTAGE: 75.8%


### Hyperparameter Tuning

In [5]:
storage = RDBStorage("sqlite:////kaggle/working/XGB__optuna_study.db")

def objective(trial):
    # Spazio di ricerca iperparametri
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'booster': 'gbtree',
        'device': 'cuda',
        'random_state': 42,
        'scale_pos_weight':0.3, #sum(negative cases) / sum(positive cases)


        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0.1, 10, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1, 10, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1, 10, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 500, 2500),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),
        'max_delta_step': trial.suggest_int('max_delta_step', 0, 10)

    }

    # Modello XGBoost
    model = XGBClassifier(**params)

    # Cross-validation
    scores = cross_val_score(model, X_Train, Y_Train, cv=5, scoring='roc_auc', n_jobs=-1)
    return np.mean(scores)

# Inizializza e ottimizza
study = optuna.create_study(
    study_name='XGBoost_Optimization',
    storage=storage,
    load_if_exists=True,
    direction='maximize')
study.optimize(objective, n_trials=70, timeout=3600)  # 50 trial o max 1 ora

# Migliori parametri trovati
print("Best parameters:")
print(study.best_params)

[I 2025-07-16 15:18:01,539] A new study created in RDB with name: XGBoost_Optimization
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


[I 2025-07-16 15:19:40,796] Trial 0 finished with value: 0.8230081717822018 and parameters: {'learning_rate': 0.003967363790633053, 'max_depth': 15, 'min_child_weight': 1, 'gamma': 0.26607430746014205, 'subsample': 0.5138416371789605, 'colsample_bytree': 0.6559573907678826, 'reg_alpha': 2.703

Best parameters:
{'learning_rate': 0.007388964985049714, 'max_depth': 8, 'min_child_weight': 6, 'gamma': 0.10500544259539538, 'subsample': 0.9614594865758529, 'colsample_bytree': 0.5180309501673342, 'reg_alpha': 1.1951933914998334, 'reg_lambda': 1.23064919032409, 'n_estimators': 1755, 'colsample_bylevel': 0.7361687322294498, 'max_delta_step': 1}


In [28]:
study=optuna.load_study(study_name='XGBoost_Optimization', storage='sqlite:////kaggle/working/XGB__optuna_study.db')

print('BEST PARAMETERS:')
for k, v in study.best_params.items():
    print(f"{k}: {v}")

BEST PARAMETERS:
learning_rate: 0.007388964985049714
max_depth: 8
min_child_weight: 6
gamma: 0.10500544259539538
subsample: 0.9614594865758529
colsample_bytree: 0.5180309501673342
reg_alpha: 1.1951933914998334
reg_lambda: 1.23064919032409
n_estimators: 1755
colsample_bylevel: 0.7361687322294498
max_delta_step: 1


### Train the model initialized with the best parameters

In [17]:
model=XGBClassifier(
    **study.best_params
)

model.fit(X_Train,Y_Train)
print('*****MODEL TRAINED*****')

# Saving the model
joblib.dump(model, "xgb_model.pkl")
print('*****MODEL SAVED*****')

NameError: name 'study' is not defined

In [41]:
model = joblib.load("xgb_model.pkl")

auc=roc_auc_score(Y_test, model.predict_proba(X_test)[:,1])
precision=precision_score(Y_test, model.predict(X_test))
recall=recall_score(Y_test, model.predict(X_test))
f1=f1_score(Y_test, model.predict(X_test))

print(f'AUC Score: {auc:5.3f}')
print(f'Precision: {precision:5.3f}')
print(f'Recall: {recall:5.3f}')
print(f'F1: {f1:5.3f}')

results={
    'AUC':auc,
    'PRECISION':precision,
    'RECALL':recall,
    'F1':f1
}

#saving the obtained results
with open("XGB_results.json", "w") as f:
    json.dump(results, f)

AUC Score: 0.834
Precision: 0.844
Recall: 0.942
F1: 0.890
