In [2]:
!pip install optuna

Collecting optuna
  Obtaining dependency information for optuna from https://files.pythonhosted.org/packages/05/3c/e9715756751e56f7df4b64c999650f418f6b48f73a824bbfe8e3604385e2/optuna-3.4.0-py3-none-any.whl.metadata
  Downloading optuna-3.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Obtaining dependency information for alembic>=1.5.0 from https://files.pythonhosted.org/packages/34/47/95d8f99c9f4a57079dfbcff5e023c5d81bde092d1c2354156340a56b3a1a/alembic-1.12.1-py3-none-any.whl.metadata
  Downloading alembic-1.12.1-py3-none-any.whl.metadata (7.3 kB)
Collecting sqlalchemy>=1.3.0 (from optuna)
  Obtaining dependency information for sqlalchemy>=1.3.0 from https://files.pythonhosted.org/packages/c3/3c/a79b9541de3eb2efeaa785b2f11acbcf6e16cc118c2791aa27ed23a448f8/SQLAlchemy-2.0.23-cp38-cp38-macosx_11_0_arm64.whl.metadata
  Downloading SQLAlchemy-2.0.23-cp38-cp38-macosx_11_0_arm64.whl.metadata (9.6 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Obtaining de

In [21]:
"""
Optuna example that demonstrates a pruner for CatBoost.

In this example, we optimize the validation accuracy of a click-prediction classifier using CatBoost.
We optimize both the choice of boosting configuration and its hyperparameters. Throughout
training of the models, a pruner observes intermediate results and stops unpromising trials.

You can run this example as follows:
    $ python catboost_pruning.py

"""

import numpy as np
import optuna
from optuna.integration import CatBoostPruningCallback

import catboost as cb
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


def objective(trial: optuna.Trial) -> float:
    """Fit one CatBoost candidate sampled by Optuna and score it on a hold-out split.

    Reads the notebook-level ``X_train`` frame (it must be loaded before the
    study runs), uses its ``click`` column as the label and passes every other
    column to CatBoost as a categorical feature. 75% of the rows are used for
    training and 25% for validation.

    Args:
        trial: Optuna trial used to sample hyperparameters and to report
            intermediate accuracy values to the pruner.

    Returns:
        Accuracy of the fitted model on the hold-out split.

    Raises:
        optuna.TrialPruned: raised by ``check_pruned()`` when the pruner
            decided to stop this trial during training.
    """
    data, target = X_train.drop(columns=["click"]), X_train.click

    # Fixed seed: every trial is evaluated on the same hold-out rows, so trial
    # values are comparable with each other and repeatable across runs.
    train_x, valid_x, train_y, valid_y = train_test_split(
        data, target, test_size=0.25, random_state=42
    )

    # Every feature column is treated as categorical.
    cat_features = list(data.columns)

    # NOTE(review): accuracy may be uninformative if `click` is imbalanced —
    # confirm it is the metric you want to optimize.
    param = {
        "cat_features": cat_features,
        "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        "used_ram_limit": "20gb",
        "eval_metric": "Accuracy",
    }

    # These two parameters are only sampled for the bootstrap type that uses them.
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1, log=True)

    gbm = cb.CatBoostClassifier(**param)

    pruning_callback = CatBoostPruningCallback(trial, "Accuracy")
    # cat_features is already set on the estimator, so it is not repeated here.
    gbm.fit(
        train_x,
        train_y,
        eval_set=[(valid_x, valid_y)],
        verbose=0,
        early_stopping_rounds=100,
        callbacks=[pruning_callback],
    )

    # Invoke pruning manually: the callback only records the decision during fit.
    pruning_callback.check_pruned()

    preds = gbm.predict(valid_x)
    pred_labels = np.rint(preds)
    accuracy = accuracy_score(valid_y, pred_labels)

    return accuracy


In [7]:
import pandas as pd

In [8]:
# Load the prepared training and test tables from the working directory.
X_train = pd.read_csv('new_train.csv')
X_test = pd.read_csv('new_test.csv')

In [22]:
# Seed the sampler so the sequence of suggested hyperparameters is repeatable.
# The 600 s timeout can still end the study after a different number of trials.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
)
study.optimize(objective, n_trials=100, timeout=600)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
# Kept as a notebook-level name so later cells can reuse the best trial.
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2023-11-14 21:04:18,542] A new study created in memory with name: no-name-fb7b9bc2-9df8-470c-9541-bd48f17aea54
  pruning_callback = CatBoostPruningCallback(trial, "Accuracy")
[I 2023-11-14 21:06:18,147] Trial 0 finished with value: 0.8311962513999865 and parameters: {'objective': 'CrossEntropy', 'colsample_bylevel': 0.04408623574391702, 'depth': 12, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.2007212290542648}. Best is trial 0 with value: 0.8311962513999865.
  pruning_callback = CatBoostPruningCallback(trial, "Accuracy")
[I 2023-11-14 21:07:19,762] Trial 1 finished with value: 0.8292451747460237 and parameters: {'objective': 'CrossEntropy', 'colsample_bylevel': 0.011830122731017861, 'depth': 12, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.8264633502688358}. Best is trial 0 with value: 0.8311962513999865.
  pruning_callback = CatBoostPruningCallback(trial, "Accuracy")
[I 2023-11-14 21:08:22,985] Trial 2 finished with value: 0.

Number of finished trials: 6
Best trial:
  Value: 0.832516097371785
  Params: 
    objective: Logloss
    colsample_bylevel: 0.05098905258775332
    depth: 11
    boosting_type: Ordered
    bootstrap_type: Bernoulli
    subsample: 0.17311817212056868


In [27]:
# Build the final classifier from the best hyperparameters found by the study.
# The class is reached through the `cb` alias because the notebook only does
# `import catboost as cb`; the bare name `CatBoostClassifier` is never imported.
model = cb.CatBoostClassifier(**study.best_params)

In [28]:
# Separate the `click` label from the remaining feature columns for the final fit.
y = X_train.click
X = X_train.drop(columns=['click'])

In [None]:
# Fit on the full training table, declaring every column categorical;
# plot=True asks CatBoost to draw its training chart in the notebook.
model.fit(
    X,
    y,
    cat_features=list(X.columns),
    plot=True,
)

Unnamed: 0,hour_01-03,hour_03-05,hour_05-07,hour_07-09,hour_09-11,hour_11-13,hour_13-15,hour_15-17,hour_17-19,hour_19-21,...,C18_1,C18_2,C18_3,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4577459,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
4577460,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4577461,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
4577462,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0


In [33]:
# Score the test rows; the `click` column is dropped so only feature columns reach the model.
test_features = X_test.drop(columns=['click'])
out = model.predict_proba(test_features)

In [34]:
# Start from the competition's sample submission file so the output keeps its layout.
submission = pd.read_csv(
    'avazu-ctr-prediction/sampleSubmission.gz'
)

In [35]:
# Column index 1 of the probability matrix — presumably P(click == 1);
# verify against model.classes_. Assumes rows are in the same order as `submission`.
click_proba = out[:, 1]
submission['click'] = click_proba

In [37]:
# Write the predictions as a gzip-compressed CSV without the index column.
submission.to_csv(
    'submission.gz',
    index=False,
    compression='gzip',
)