In [131]:
#!pip install numpy pandas scikit-learn matplotlib catboost optuna
import catboost as cb
import numpy as np
import optuna
import pandas as pd
from optuna.integration import CatBoostPruningCallback
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [132]:
# Read the first 1000 training rows and the complete test set.
df = pd.read_csv('train.csv', nrows=1000)
df_test = pd.read_csv('test.csv')
# Drop the leading id column from the training frame only; df_test keeps
# its first column, which is split off later.
df = df.drop(columns=df.columns[0])

In [133]:
# Change dtypes from object to category (via str, so missing values become
# the literal category 'nan' instead of NaN).
for col in df.columns[df.dtypes == 'object']:
    df[col] = df[col].astype('str').astype('category')
    # A train-only object column would raise KeyError on the test frame.
    if col in df_test.columns:
        df_test[col] = df_test[col].astype('str').astype('category')

# Encode the target as int8.
df['Transported'] = df['Transported'].astype('int8')

# Find positional indexes of the categorical features for CatBoost.
# They are computed on the feature frame (target removed) because that is
# the frame the models are fitted on; computing them on `df` itself is only
# correct while 'Transported' happens to be the last column.
features = df.drop(columns='Transported')
cat_features = [
    position
    for position, is_categorical in enumerate(features.dtypes == 'category')
    if is_categorical
]

# Create X_test: every test column except the first, which is kept
# separately in `out` so it can be re-attached to the predictions.
X_test = df_test.iloc[:, 1:]
out = df_test.iloc[:, 0]

In [134]:
"""
Optuna example that demonstrates a pruner for CatBoost.
In this example, we optimize the validation accuracy of a CatBoost classifier that
predicts the `Transported` target. We optimize both categorical model choices
(objective, boosting type, bootstrap type) and numeric hyperparameters. Throughout
training, a pruner observes intermediate results and stops unpromising trials.
"""

def objective(trial: optuna.Trial, seed: int = 42) -> float:
    """Fit one CatBoost classifier with trial-suggested parameters and score it.

    Reads the module-level ``df`` (features plus the 'Transported' target) and
    ``cat_features`` (categorical column indexes).

    Args:
        trial: Optuna trial used to sample hyperparameters and to report
            intermediate results to the pruner.
        seed: Random state for the train/validation split. It is fixed so that
            every trial is scored on the same hold-out rows; with an unseeded
            split the trial values are not comparable with each other.

    Returns:
        Accuracy of the fitted model on the 25% validation split.

    Raises:
        optuna.TrialPruned: from ``check_pruned`` when the pruner stopped the
            trial during training.
    """
    data, target = df.drop('Transported', axis=1), df['Transported']
    train_x, valid_x, train_y, valid_y = train_test_split(
        data, target, test_size=0.25, random_state=seed
    )

    param = {
        "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        "used_ram_limit": "3gb",
        "eval_metric": "Accuracy",
        "cat_features": cat_features
    }

    # These two parameters are only sampled for the bootstrap type they are
    # paired with here; "MVS" gets no extra parameter.
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1, log=True)

    gbm = cb.CatBoostClassifier(**param)

    # The callback watches the same metric that is set as eval_metric above.
    pruning_callback = CatBoostPruningCallback(trial, "Accuracy")
    gbm.fit(
        train_x,
        train_y,
        eval_set=[(valid_x, valid_y)],
        verbose=0,
        early_stopping_rounds=100,
        callbacks=[pruning_callback],
    )

    # Invoke pruning manually: the callback only records the decision during
    # fit(); this call is what raises it to Optuna.
    pruning_callback.check_pruned()

    preds = gbm.predict(valid_x)
    pred_labels = np.rint(preds)
    accuracy = accuracy_score(valid_y, pred_labels)

    return accuracy


if __name__ == "__main__":
    # Seed the sampler so the sequence of suggested parameters is repeatable.
    # TPESampler is what create_study uses by default, so only the seed is new.
    # The 600 s timeout can still end a run before all 100 trials finish.
    study = optuna.create_study(
        sampler=optuna.samplers.TPESampler(seed=42),
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
        direction="maximize",
    )
    study.optimize(objective, n_trials=100, timeout=600)

    print(f"Number of finished trials: {len(study.trials)}")

    print("Best trial:")
    trial = study.best_trial

    print(f"  Value: {trial.value}")

    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

[32m[I 2022-12-16 17:54:11,220][0m A new study created in memory with name: no-name-d7c315ed-59c6-4b1f-a262-bbe6406d4568[0m
  pruning_callback = CatBoostPruningCallback(trial, "Accuracy")
[32m[I 2022-12-16 17:54:14,201][0m Trial 0 finished with value: 0.776 and parameters: {'objective': 'CrossEntropy', 'colsample_bylevel': 0.08570252728855927, 'depth': 4, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.1183884275776284}. Best is trial 0 with value: 0.776.[0m
  pruning_callback = CatBoostPruningCallback(trial, "Accuracy")
[32m[I 2022-12-16 17:54:15,155][0m Trial 1 finished with value: 0.78 and parameters: {'objective': 'CrossEntropy', 'colsample_bylevel': 0.061481818944880764, 'depth': 3, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.31464026926456506}. Best is trial 1 with value: 0.78.[0m
  pruning_callback = CatBoostPruningCallback(trial, "Accuracy")
[32m[I 2022-12-16 17:54:16,719][0m Trial 2 finished with value: 0.792 and

Number of finished trials: 100
Best trial:
  Value: 0.824
  Params: 
    objective: CrossEntropy
    colsample_bylevel: 0.09805148601310225
    depth: 9
    boosting_type: Plain
    bootstrap_type: Bernoulli
    subsample: 0.7063154490355584


In [135]:
# Start from the best trial's sampled hyperparameters and add the categorical
# column indexes, which are not part of the search space.
bp = dict(study.best_params, cat_features=cat_features)

In [136]:
# Refit on all loaded training rows with the best parameters, then predict
# labels for the test features.
final_model = cb.CatBoostClassifier(**bp)
final_model.fit(df.drop('Transported', axis=1), df['Transported'])
results = final_model.predict(X_test)

0:	learn: 0.6931443	total: 2.55ms	remaining: 2.55s
1:	learn: 0.6931415	total: 4.56ms	remaining: 2.27s
2:	learn: 0.6777604	total: 27.6ms	remaining: 9.16s
3:	learn: 0.6680232	total: 60ms	remaining: 14.9s
4:	learn: 0.6541094	total: 115ms	remaining: 22.8s
5:	learn: 0.6487142	total: 130ms	remaining: 21.5s
6:	learn: 0.6487091	total: 132ms	remaining: 18.7s
7:	learn: 0.6455251	total: 134ms	remaining: 16.7s
8:	learn: 0.6455210	total: 136ms	remaining: 15s
9:	learn: 0.6400052	total: 166ms	remaining: 16.5s
10:	learn: 0.6370730	total: 169ms	remaining: 15.2s
11:	learn: 0.6273257	total: 225ms	remaining: 18.6s
12:	learn: 0.6218290	total: 241ms	remaining: 18.3s
13:	learn: 0.6191581	total: 250ms	remaining: 17.6s
14:	learn: 0.6088679	total: 275ms	remaining: 18.1s
15:	learn: 0.6034428	total: 300ms	remaining: 18.5s
16:	learn: 0.6034381	total: 303ms	remaining: 17.5s
17:	learn: 0.6014440	total: 305ms	remaining: 16.6s
18:	learn: 0.5971317	total: 343ms	remaining: 17.7s
19:	learn: 0.5930518	total: 415ms	remaini

In [137]:
# Pair each test-row identifier in `out` with its prediction. The prediction
# Series takes `out`'s index, so rows line up even if that index is not the
# default 0..n-1, and is named after the target instead of being left as `0`.
# NOTE: this cell rebinds `results` from the prediction array to a DataFrame,
# so re-running it requires re-running the prediction cell first.
results = pd.concat(
    [out, pd.Series(results, index=out.index, name='Transported')],
    axis=1,
)

In [138]:
# index=False: the row index is only a positional counter, and writing it
# would add a spurious unnamed first column to the file.
results.to_csv('results.csv', index=False)