In [6]:
import xgboost as xgb
import optuna
import pandas as pd
from google.cloud import bigquery
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from etrace.load_data import load_from_bq

# =============================================================================
# 1. LOAD DATA FROM BIGQUERY
# =============================================================================


df = load_from_bq("SELECT * FROM aklewagonproject.etrace.cleaned_final_interpolated_dataset")








# -----------------------------------------------------------------------------
# ⚠️ ACTION REQUIRED: Define your Target Column
# -----------------------------------------------------------------------------
target_col = 'nights_spent'  # <--- REPLACE THIS with your actual target column name

# Check if target exists to avoid errors
if target_col not in df.columns:
    raise ValueError(f"Column '{target_col}' not found in dataset. Available columns: {list(df.columns)}")

# Split into X (features) and y (target)
X = df.drop(columns=[target_col])
y = df[target_col]

print(f"Data loaded. Shape: {df.shape}")

# =============================================================================
# 2. MODEL SETUP & OPTIMIZATION
# =============================================================================
max_bin = 255

# Convert the datasets to QuantileDMatrix for training and validation
# Note: QuantileDMatrix is optimized for the 'hist' tree method
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=7, stratify=X['geo'])

dtrain = xgb.QuantileDMatrix(X_train, y_train, max_bin=max_bin)
dval = xgb.QuantileDMatrix(X_test, y_test, max_bin=max_bin)

def objective(trial):
    params = {
        # ⚠️ NOTE: If this is binary classification (AUC), use 'binary:logistic'
        # 'reg:squarederror' is for regression, but can work for classification proxies.
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'tree_method': 'hist',
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.5, log=True),
        'sampling_method': 'gradient_based',
        # 'grow_gpu_hist' is deprecated in newer XGBoost versions; 'hist' with 'device': 'cuda' is preferred if using GPU
        'updater': 'grow_gpu_hist,prune',
        'nthread': -1,
        'max_bin': max_bin,
        'max_delta_step': trial.suggest_int('max_delta_step', 1, 20),
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.1, 10.0),
    }

    model = xgb.train(
        params,
        dtrain,
        evals=[(dval, 'validation')],
        early_stopping_rounds=50,
        num_boost_round=5000,
        verbose_eval=False # Set to False to reduce clutter during 1000 trials
    )

    preds = model.predict(dval)

    # Calculate AUC
    auc = roc_auc_score(y_test, preds)

    return auc

# Create a study and optimize the objective function
# CHANGED: direction="maximize" because we are optimizing AUC (higher is better)
study = optuna.create_study(
        direction="maximize",
        study_name="etrace_optimization",
        load_if_exists=True,
)

print("Starting optimization...")
study.optimize(objective, n_trials=1000)
print("Best params:", study.best_params)




Data loaded. Shape: (4317, 32)


ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [7]:
df

Unnamed: 0,geo,year,pop,employment_rate,gdp_capita,NUTS_NAME,area_km2,pop_dens,pct_Dfb,pct_Dfc,...,pct_Cfc,pct_BWh,pct_Af,pct_Am,pct_Aw,pct_Cwa,pct_Cwb,pct_Csc,pct_Dsa,nights_spent
0,ES52,2000,4103816.0,62.5,15200.0,Comunitat Valenciana,23264.016726,176.401868,0.000015,0.0,...,0.0,0.058283,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,32567572.0
1,ES52,2001,4135183.0,63.9,16500.0,Comunitat Valenciana,23264.016726,177.750173,0.000012,0.0,...,0.0,0.059404,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,33505671.0
2,ES52,2002,4192277.0,64.5,17200.0,Comunitat Valenciana,23264.016726,180.204349,0.000009,0.0,...,0.0,0.060528,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,33564801.0
3,ES52,2003,4322066.0,64.9,17900.0,Comunitat Valenciana,23264.016726,185.783309,0.000006,0.0,...,0.0,0.061654,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,34536607.0
4,ES52,2004,4441941.0,66.7,18600.0,Comunitat Valenciana,23264.016726,190.936116,0.000003,0.0,...,0.0,0.062782,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,35862548.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4312,TRC3,2015,2153921.0,35.4,4700.0,"Mardin, Batman, Şırnak, Siirt",26050.005235,82.684091,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.180624,340630.0
4313,TRC3,2016,2173759.0,34.7,4600.0,"Mardin, Batman, Şırnak, Siirt",26050.005235,83.445626,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.179354,366460.0
4314,TRC3,2018,2222601.0,37.6,4000.0,"Mardin, Batman, Şırnak, Siirt",26050.005235,85.320559,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.176810,1050019.0
4315,TRC3,2019,2284158.0,36.9,4200.0,"Mardin, Batman, Şırnak, Siirt",26050.005235,87.683591,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.175536,1177516.0
