[Reference](https://medium.com/@Rohan_Dutt/why-gradient-boosting-often-beats-deep-learning-on-tabular-data-and-how-to-tune-it-17c4c59b1782)

# 1. Control Tree Depth Before Anything Else

In [1]:
# Depth cap applied to every tree; values from 3 to 6 are worth trying.
max_depth = 4
# A binary tree of depth d holds at most 2**d leaves, so derive the leaf
# budget from the depth instead of hard-coding it.
num_leaves = 1 << max_depth

# 2. Shrink With Learning Rate

In [2]:
# Step size and boosting-round budget are chosen together: a small
# learning rate is paired with a large number of estimators.
learning_rate, n_estimators = 0.03, 1500

# 3. Use Feature Subsampling

In [3]:
# Sampling fractions; the same names are used as estimator keyword
# arguments in the templates further down.
colsample_bytree, subsample = 0.7, 0.8

# 4. Regularize Hard on Small Datasets

In [4]:
# L1 and L2 penalty strengths, set as a pair.
lambda_l1, lambda_l2 = 2.0, 5.0
# Minimum child weight required for a split to be kept.
min_child_weight = 10

# Minimal Python Template to Tune Gradient Boosting

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
model = LGBMClassifier(
    max_depth=5,
    num_leaves=32,  # 2**max_depth, the most leaves a depth-5 tree can hold
    learning_rate=0.03,
    n_estimators=1200,
    colsample_bytree=0.7,
    subsample=0.8,
    # LightGBM only applies row subsampling when the bagging frequency is
    # non-zero; with the default of 0 the `subsample` value above is ignored.
    subsample_freq=1,
    # Use the scikit-learn wrapper's own names for the L1/L2 penalties
    # (aliases of lambda_l1 / lambda_l2) so the same setting is not supplied
    # under two different names.
    reg_alpha=2.0,
    reg_lambda=5.0,
    min_child_weight=10,
)
model.fit(X_train, y_train)
# Keep the second probability column (class at index 1) as the ranking score.
preds = model.predict_proba(X_val)[:, 1]
print("AUC:", roc_auc_score(y_val, preds))

# Clean Boosting Baseline

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier

# 80/20 hold-out split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Gather the baseline hyperparameters in one mapping so they can be read
# (and tweaked) in a single place before the estimator is built.
xgb_kwargs = dict(
    max_depth=5,
    learning_rate=0.05,
    n_estimators=800,
    subsample=0.8,
    colsample_bytree=0.7,
    reg_alpha=2.0,
    reg_lambda=5.0,
    min_child_weight=10,
    eval_metric="auc",
)
model = XGBClassifier(**xgb_kwargs)
model.fit(X_train, y_train)

# Score the hold-out rows using the second probability column.
preds = model.predict_proba(X_val)[:, 1]
print("AUC:", roc_auc_score(y_val, preds))

# Clean Hyperparameter Search That Actually Works

In [8]:
import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

def objective(trial):
    """Return the mean 3-fold ROC AUC for one sampled XGBoost configuration.

    Hyperparameters are drawn from ``trial`` within fixed ranges, an
    ``XGBClassifier`` is built from them, and it is cross-validated on the
    module-level ``X`` and ``y``.
    """
    search_space = {}
    search_space["max_depth"] = trial.suggest_int("max_depth", 3, 5)
    search_space["learning_rate"] = trial.suggest_float("learning_rate", 0.05, 0.2)
    search_space["n_estimators"] = trial.suggest_int("n_estimators", 400, 1200)
    search_space["subsample"] = trial.suggest_float("subsample", 0.7, 0.9)
    search_space["colsample_bytree"] = trial.suggest_float("colsample_bytree", 0.6, 0.9)
    search_space["reg_alpha"] = trial.suggest_float("reg_alpha", 0, 5)
    search_space["reg_lambda"] = trial.suggest_float("reg_lambda", 1, 10)
    search_space["min_child_weight"] = trial.suggest_int("min_child_weight", 5, 20)
    # Fixed, not searched.
    search_space["eval_metric"] = "auc"

    classifier = XGBClassifier(**search_space)
    fold_scores = cross_val_score(classifier, X, y, cv=3, scoring="roc_auc")
    return fold_scores.mean()
# Higher AUC is better, so the study maximises the objective's return value.
study = optuna.create_study(
    direction="maximize",
)
# Run 40 trials, then report the best configuration found.
study.optimize(func=objective, n_trials=40)
print("Best params:", study.best_params)

But deep learning is only worth the pain if you are dealing with:
- ✔ Hierarchical data, like nested patient histories
- ✔ Categorical features with more than 500 unique values, where tree splits fall apart
- ✔ Hidden temporal or sensor patterns that trees cannot model cleanly

# Simple Hybrid Tree + Neural Head

In [9]:
import numpy as np
from lightgbm import LGBMRegressor
from tensorflow.keras import layers, models

# Step 1. Train a boosting model and extract leaf indexes as features
gboost = LGBMRegressor(
    max_depth=5,
    n_estimators=800,
    learning_rate=0.05,
    subsample=0.8,
    # LightGBM only applies row subsampling when the bagging frequency is
    # non-zero; with the default of 0 the `subsample` value above is ignored.
    subsample_freq=1,
)
gboost.fit(X_train, y_train)
# With pred_leaf=True each row is described by the index of the leaf it
# falls into in every tree, i.e. an integer array of shape (n_samples, n_trees).
leaf_train = gboost.predict(X_train, pred_leaf=True)
leaf_val = gboost.predict(X_val, pred_leaf=True)

# Step 2. One hot encode leaf indices
# Leaf indices range over [0, num_leaves), so the identity matrix must be
# num_leaves wide (not n_estimators wide). Indexing it with the whole
# (n_samples, n_trees) array yields (n_samples, n_trees, num_leaves); the
# reshape flattens that to one 2-D feature row per sample, which is the
# shape the Dense head below declares as its input.
# NOTE: this is a dense float32 matrix of n_trees * num_leaves columns;
# lower n_estimators or switch to a sparse encoding for large datasets.
leaf_identity = np.eye(gboost.num_leaves, dtype="float32")
leaf_train = leaf_identity[leaf_train].reshape(len(leaf_train), -1)
leaf_val = leaf_identity[leaf_val].reshape(len(leaf_val), -1)

# Step 3. Tiny neural head
model = models.Sequential([
    layers.Dense(64, activation="relu", input_shape=(leaf_train.shape[1],)),
    layers.Dense(1, activation="linear")
])
model.compile(optimizer="adam", loss="mse")
model.fit(leaf_train, y_train, epochs=15, batch_size=64, verbose=0)
# The head outputs shape (n_samples, 1); flatten to a 1-D prediction vector.
preds = model.predict(leaf_val).flatten()