In [1]:
import optuna
import pandas as pd
import numpy as np
import xgboost as xgb

from dotenv import dotenv_values
from sklearn.model_selection import train_test_split

In [2]:
# Load the key/value settings stored in the .env file one directory above
# the notebook's working directory.
config = dotenv_values('../.env')

In [3]:
# Read the engineered training table. The path is built by plain string
# concatenation, so the ENGINEERED_DATA setting must already end with a
# path separator.
train = pd.read_parquet(config["ENGINEERED_DATA"] + "train_fe.parquet")

In [4]:
def amex_metric_mod(y_true, y_pred):
    """Score predictions as the mean of a normalized weighted Gini and a top-4% capture rate.

    Rows whose true label equals 0 carry a weight of 20; all other rows carry
    a weight of 1.

    Args:
        y_true: Sequence of true labels (compared against 0 to pick weights).
        y_pred: Sequence of predicted scores, same length as ``y_true``;
            higher scores are ranked first.

    Returns:
        ``0.5 * (G + D)`` where ``G`` is the weighted Gini of the
        prediction ordering divided by the weighted Gini of the true-label
        ordering, and ``D`` is the share of the label total found in the
        highest-scored rows whose cumulative weight stays within 4% of the
        total weight.
    """
    def _ranked(sort_col):
        # Column 0 = true label, column 1 = score; rows ordered by
        # `sort_col`, largest value first.
        table = np.array([y_true, y_pred]).T
        return table[table[:, sort_col].argsort()[::-1]]

    def _weighted_gini(sort_col):
        # Sum of the weighted gap between the cumulative share of positives
        # and the cumulative share of weight along the ranking.
        ranked = _ranked(sort_col)
        truth = ranked[:, 0]
        w = np.where(truth == 0, 20, 1)
        cum_weight_share = np.cumsum(w / np.sum(w))
        weighted_pos = truth * w
        lorentz = np.cumsum(weighted_pos) / np.sum(weighted_pos)
        return np.sum((lorentz - cum_weight_share) * w)

    # Capture rate inside the top 4% (by weight) of the score ranking.
    by_score = _ranked(1)
    w = np.where(by_score[:, 0] == 0, 20, 1)
    within_cutoff = np.cumsum(w) <= int(0.04 * np.sum(w))
    top_four = np.sum(by_score[within_cutoff][:, 0]) / np.sum(by_score[:, 0])

    # Normalize the model's Gini by the Gini of a perfect (label-sorted) ranking.
    gini_by_score = _weighted_gini(1)
    gini_by_label = _weighted_gini(0)

    return 0.5 * (gini_by_score / gini_by_label + top_four)

In [8]:
def objective(trial):
    """Optuna objective: train one XGBoost model and return its validation score.

    Samples a booster configuration from ``trial``, fits it on a hold-out
    split of the module-level ``train`` frame, and scores the validation
    predictions with ``amex_metric_mod``.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The ``amex_metric_mod`` value computed on the validation split.
    """
    # Every column except the first and the last is used as a feature.
    # NOTE(review): presumably the first column is an identifier and the last
    # one is "target" - confirm against the engineered parquet schema.
    feature_cols = train.columns[1:-1]

    # A fixed seed gives every trial the same validation rows, so scores differ
    # only because of the sampled hyperparameters and not because of the split.
    # Stratifying keeps the class balance identical in both parts.
    X_train, X_valid, y_train, y_valid = train_test_split(
        train[feature_cols],
        train["target"],
        test_size=0.2,
        random_state=42,
        stratify=train["target"],
    )
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_valid, label=y_valid)

    param = {
        "verbosity": 0,
        "objective": trial.suggest_categorical(
            "objective",
            ["reg:squarederror", "reg:logistic", "reg:squaredlogerror",
             "binary:logistic", "binary:logitraw", "binary:hinge"],
        ),
        "tree_method": "gpu_hist",
        # defines booster, gblinear for linear functions.
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        # L2 regularization weight.
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        # L1 regularization weight.
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        # sampling according to each tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
    }

    # Tree-specific parameters are only sampled for the tree-based boosters.
    if param["booster"] in ["gbtree", "dart"]:
        # maximum depth of the tree, signifies complexity of the tree.
        param["max_depth"] = trial.suggest_int("max_depth", 3, 9, step=2)
        # minimum child weight, larger the term more conservative the tree.
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        # defines how selective algorithm is.
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

    # Dropout-specific parameters are only sampled for the dart booster.
    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    # The round count is spelled out (10 is what xgb.train uses when it is
    # omitted) so the training budget is visible next to the sampled eta.
    bst = xgb.train(param, dtrain, num_boost_round=10)
    preds = bst.predict(dvalid)
    return amex_metric_mod(y_valid.values, preds)

In [None]:
# Seed the sampler so the sequence of suggested hyperparameters is repeatable.
# TPESampler is Optuna's default single-objective sampler, so only the
# seeding changes. The wall-clock timeout can still end runs at different
# trial counts.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
# Stop after 254 trials or 3600 seconds, whichever comes first.
study.optimize(objective, n_trials=254, timeout=3600)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[32m[I 2022-08-12 16:09:53,762][0m A new study created in memory with name: no-name-2daa4539-37d5-4850-a993-e2f5dbffff32[0m
[32m[I 2022-08-12 16:10:13,250][0m Trial 0 finished with value: 0.7571127778361499 and parameters: {'objective': 'reg:squarederror', 'booster': 'gbtree', 'lambda': 0.6508481050247027, 'alpha': 4.100134379243512e-06, 'subsample': 0.783257564439271, 'colsample_bytree': 0.37292309268031504, 'max_depth': 9, 'min_child_weight': 8, 'eta': 2.891115991276801e-06, 'gamma': 1.4236313202777436e-08, 'grow_policy': 'depthwise'}. Best is trial 0 with value: 0.7571127778361499.[0m
[32m[I 2022-08-12 16:10:27,194][0m Trial 1 finished with value: 0.5701205871056256 and parameters: {'objective': 'binary:logitraw', 'booster': 'gblinear', 'lambda': 0.33684975991570704, 'alpha': 0.595861990096303, 'subsample': 0.8081511111046056, 'colsample_bytree': 0.6132174078520232}. Best is trial 0 with value: 0.7571127778361499.[0m
[32m[I 2022-08-12 16:11:06,040][0m Trial 2 finished wit

[32m[I 2022-08-12 16:14:59,348][0m Trial 19 finished with value: 0.7582846421099025 and parameters: {'objective': 'reg:logistic', 'booster': 'gblinear', 'lambda': 1.5677975377242596e-08, 'alpha': 0.001479318467664559, 'subsample': 0.21057302784476045, 'colsample_bytree': 0.9010458709016627}. Best is trial 9 with value: 0.7704008142978007.[0m
[32m[I 2022-08-12 16:15:13,478][0m Trial 20 finished with value: 0.6914296773168787 and parameters: {'objective': 'reg:logistic', 'booster': 'gblinear', 'lambda': 5.896404482315677e-07, 'alpha': 0.025474479988551135, 'subsample': 0.4582525046051482, 'colsample_bytree': 0.9136589539206051}. Best is trial 9 with value: 0.7704008142978007.[0m
[32m[I 2022-08-12 16:15:28,153][0m Trial 21 finished with value: 0.7599849658149589 and parameters: {'objective': 'reg:logistic', 'booster': 'gblinear', 'lambda': 1.1256554657150104e-08, 'alpha': 0.0008125876728574089, 'subsample': 0.23699572886703107, 'colsample_bytree': 0.8777009533343687}. Best is tria

[32m[I 2022-08-12 16:20:45,981][0m Trial 41 finished with value: 0.7538289748458368 and parameters: {'objective': 'reg:squarederror', 'booster': 'gbtree', 'lambda': 0.2430430546683116, 'alpha': 2.5107479915620354e-07, 'subsample': 0.7832089191891932, 'colsample_bytree': 0.479666218700728, 'max_depth': 9, 'min_child_weight': 9, 'eta': 3.516212930829288e-06, 'gamma': 2.5858314527787784e-07, 'grow_policy': 'depthwise'}. Best is trial 9 with value: 0.7704008142978007.[0m
[32m[I 2022-08-12 16:20:58,954][0m Trial 42 finished with value: 0.7450505212681061 and parameters: {'objective': 'reg:squarederror', 'booster': 'gbtree', 'lambda': 0.0008779455974485153, 'alpha': 8.753335922866578e-07, 'subsample': 0.6577857352556374, 'colsample_bytree': 0.3848656241109863, 'max_depth': 7, 'min_child_weight': 7, 'eta': 2.860248605437734e-06, 'gamma': 7.198838288291181e-08, 'grow_policy': 'depthwise'}. Best is trial 9 with value: 0.7704008142978007.[0m
[32m[I 2022-08-12 16:21:12,658][0m Trial 43 fi

[32m[I 2022-08-12 16:25:59,371][0m Trial 60 finished with value: 0.7596927893590475 and parameters: {'objective': 'reg:logistic', 'booster': 'gblinear', 'lambda': 0.03519528727159428, 'alpha': 7.146228154968934e-05, 'subsample': 0.9756613971506845, 'colsample_bytree': 0.9213670165932192}. Best is trial 9 with value: 0.7704008142978007.[0m
[32m[I 2022-08-12 16:26:14,604][0m Trial 61 finished with value: 0.762276772428948 and parameters: {'objective': 'reg:logistic', 'booster': 'gblinear', 'lambda': 0.029970684515643633, 'alpha': 5.159262680494899e-05, 'subsample': 0.9706436460650941, 'colsample_bytree': 0.992907100914146}. Best is trial 9 with value: 0.7704008142978007.[0m
[32m[I 2022-08-12 16:26:30,414][0m Trial 62 finished with value: 0.7709672605698499 and parameters: {'objective': 'reg:logistic', 'booster': 'gblinear', 'lambda': 0.010694712826538255, 'alpha': 0.0002062906081111435, 'subsample': 0.9572490611717799, 'colsample_bytree': 0.9878569814841227}. Best is trial 62 wit

[32m[I 2022-08-12 16:31:56,920][0m Trial 83 finished with value: 0.7673366078008107 and parameters: {'objective': 'reg:logistic', 'booster': 'gblinear', 'lambda': 0.0017970054742648958, 'alpha': 1.009900530971288e-06, 'subsample': 0.9552592226556991, 'colsample_bytree': 0.9072577436228446}. Best is trial 71 with value: 0.7727049110548927.[0m
[32m[I 2022-08-12 16:32:12,086][0m Trial 84 finished with value: 0.7597315409982883 and parameters: {'objective': 'reg:logistic', 'booster': 'gblinear', 'lambda': 0.012634047129904177, 'alpha': 0.00034916272086561284, 'subsample': 0.9027856468434461, 'colsample_bytree': 0.9454886708722555}. Best is trial 71 with value: 0.7727049110548927.[0m
[32m[I 2022-08-12 16:32:27,838][0m Trial 85 finished with value: 0.7324822232958654 and parameters: {'objective': 'binary:logitraw', 'booster': 'gblinear', 'lambda': 0.00025360615547247536, 'alpha': 6.864522460753871e-06, 'subsample': 0.9991345271515129, 'colsample_bytree': 0.931937234673506}. Best is t

[32m[I 2022-08-12 16:38:00,851][0m Trial 106 finished with value: 0.7568268126701689 and parameters: {'objective': 'binary:logistic', 'booster': 'gblinear', 'lambda': 0.0005619742283868104, 'alpha': 0.0005320289535847335, 'subsample': 0.9812386506361714, 'colsample_bytree': 0.7915097068023148}. Best is trial 71 with value: 0.7727049110548927.[0m
[32m[I 2022-08-12 16:38:38,348][0m Trial 107 finished with value: 0.5664149432286782 and parameters: {'objective': 'binary:hinge', 'booster': 'gblinear', 'lambda': 0.040566304903855016, 'alpha': 6.984249940620777e-05, 'subsample': 0.9410376629529419, 'colsample_bytree': 0.9821944544954057}. Best is trial 71 with value: 0.7727049110548927.[0m
[32m[I 2022-08-12 16:38:51,611][0m Trial 108 finished with value: 0.6740884487243767 and parameters: {'objective': 'reg:logistic', 'booster': 'dart', 'lambda': 0.06285735422496931, 'alpha': 1.4820338695251934e-06, 'subsample': 0.907973144322109, 'colsample_bytree': 0.9113906055370903, 'max_depth': 3

[32m[I 2022-08-12 16:44:02,664][0m Trial 129 finished with value: 0.6984325712520798 and parameters: {'objective': 'reg:logistic', 'booster': 'dart', 'lambda': 0.0004977425941637496, 'alpha': 0.00022512832744341066, 'subsample': 0.9998278399897421, 'colsample_bytree': 0.9601215187347737, 'max_depth': 5, 'min_child_weight': 6, 'eta': 0.0015918097688596932, 'gamma': 0.0032108735696261604, 'grow_policy': 'lossguide', 'sample_type': 'uniform', 'normalize_type': 'tree', 'rate_drop': 0.03460023841908398, 'skip_drop': 0.01893674538953361}. Best is trial 122 with value: 0.7741670411669009.[0m
[32m[I 2022-08-12 16:44:17,122][0m Trial 130 finished with value: 0.7685443944475426 and parameters: {'objective': 'reg:logistic', 'booster': 'gblinear', 'lambda': 0.004527405728133453, 'alpha': 0.0006240403443623519, 'subsample': 0.9545983503270011, 'colsample_bytree': 0.9986777049077159}. Best is trial 122 with value: 0.7741670411669009.[0m
[32m[I 2022-08-12 16:44:32,773][0m Trial 131 finished w

[32m[I 2022-08-12 16:49:52,260][0m Trial 152 finished with value: 0.765159189407401 and parameters: {'objective': 'reg:logistic', 'booster': 'gblinear', 'lambda': 0.011548294291701075, 'alpha': 1.9614693308536967e-06, 'subsample': 0.935216376114401, 'colsample_bytree': 0.8960166353181849}. Best is trial 122 with value: 0.7741670411669009.[0m
[32m[I 2022-08-12 16:50:06,862][0m Trial 153 finished with value: 0.7697440226154424 and parameters: {'objective': 'reg:logistic', 'booster': 'gblinear', 'lambda': 0.006709162576359382, 'alpha': 3.475496274344824e-06, 'subsample': 0.9627509356186382, 'colsample_bytree': 0.9478664206697636}. Best is trial 122 with value: 0.7741670411669009.[0m
[32m[I 2022-08-12 16:50:21,545][0m Trial 154 finished with value: 0.7751803335337588 and parameters: {'objective': 'reg:logistic', 'booster': 'gblinear', 'lambda': 0.0021621509593986745, 'alpha': 0.00028126701524793755, 'subsample': 0.9508284552641411, 'colsample_bytree': 0.8599729375333017}. Best is t

[32m[I 2022-08-12 16:56:03,042][0m Trial 175 finished with value: 0.7685357137968465 and parameters: {'objective': 'reg:logistic', 'booster': 'gblinear', 'lambda': 0.006456576979788695, 'alpha': 3.8968785303318864e-05, 'subsample': 0.9384049514848103, 'colsample_bytree': 0.983973105179116}. Best is trial 154 with value: 0.7751803335337588.[0m
[32m[I 2022-08-12 16:56:16,509][0m Trial 176 finished with value: 0.618409206487382 and parameters: {'objective': 'reg:logistic', 'booster': 'gbtree', 'lambda': 0.016110327119052168, 'alpha': 1.982537358438002e-05, 'subsample': 0.9835199036243344, 'colsample_bytree': 0.9986790612623394, 'max_depth': 5, 'min_child_weight': 5, 'eta': 1.1728621017175875e-08, 'gamma': 0.0008873409597401572, 'grow_policy': 'lossguide'}. Best is trial 154 with value: 0.7751803335337588.[0m
[32m[I 2022-08-12 16:56:31,340][0m Trial 177 finished with value: 0.7659651140936472 and parameters: {'objective': 'reg:logistic', 'booster': 'gblinear', 'lambda': 0.009852892