In [55]:
#!pip install numpy pandas scikit-learn matplotlib catboost optuna
import pandas as pd
import numpy as np
import optuna
from optuna.integration import CatBoostPruningCallback

import catboost as cb
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [56]:
df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
#drop ids
df.drop(df.columns[0],axis = 1, inplace=True)

In [64]:
df.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,deck,num,side,Fname,Sname
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,B,0,P,Maham,Ofracculy
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,F,0,S,Juanna,Vines
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,A,0,S,Altark,Susent
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,A,0,S,Solam,Susent
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,F,1,S,Willy,Santantines


0           0.0
1         736.0
2       10383.0
3        5176.0
4        1091.0
         ...   
8688     8536.0
8689        0.0
8690     1873.0
8691     4637.0
8692     4826.0
Length: 8693, dtype: float64

In [57]:
#Feature creation
df[['deck', 'num','side']] = df['Cabin'].str.split(pat = '/', n = 2, expand=True)
df_test[['deck', 'num','side']] = df_test['Cabin'].str.split(pat = '/', n = 2, expand=True)

df[['Fname', 'Sname']] = df['Name'].str.split(pat = ' ', n = 1, expand=True)
df_test[['Fname', 'Sname']] = df_test['Name'].str.split(pat = ' ', n = 1, expand=True)

df['Spending'] = df[['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']].sum(axis = 1)
df_test['Spending'] = df_test[['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']].sum(axis = 1)

#Target
Target = df['Transported'].astype('int8')


#Data manipulation
#Drop name, I think it's noise
df.drop(['Name','Cabin','Transported'],axis = 1, inplace = True)
df_test.drop(['Name','Cabin'],axis = 1, inplace = True)

In [58]:
#Change dtypes from object to column
for col in df.columns[(df.dtypes == 'object')]:
    df[col] = df[col].astype('str').astype('category')
    df_test[col] = df_test[col].astype('str').astype('category')



#find indexes of cat_features for catboost
l = df.columns[(df.dtypes == 'category')]
cat_features = [list(df.columns).index(u) for u in l]



#create X_test
X_test = df_test.iloc[:,1:]
out = df_test.iloc[:,0]

In [60]:
"""
Optuna example that demonstrates a pruner for CatBoost.
In this example, we optimize the validation accuracy of cancer detection using CatBoost.
We optimize both the choice of booster models and their hyperparameters. Throughout
training of models, a pruner observes intermediate results and stop unpromising trials.
"""

def objective(trial: optuna.Trial) -> float:
    data, target = df,Target
    train_x, valid_x, train_y, valid_y = train_test_split(data, target, test_size=0.25)

    param = {
        "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        "used_ram_limit": "3gb",
        "eval_metric": "Accuracy",
        "cat_features": cat_features
    }

    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1, log=True)

    gbm = cb.CatBoostClassifier(**param)

    pruning_callback = CatBoostPruningCallback(trial, "Accuracy")
    gbm.fit(
        train_x,
        train_y,
        eval_set=[(valid_x, valid_y)],
        verbose=0,
        early_stopping_rounds=100,
        callbacks=[pruning_callback],
    )

    # evoke pruning manually.
    pruning_callback.check_pruned()

    preds = gbm.predict(valid_x)
    pred_labels = np.rint(preds)
    accuracy = accuracy_score(valid_y, pred_labels)

    return accuracy


if __name__ == "__main__":
    study = optuna.create_study(
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=5), direction="maximize"
    )
    study.optimize(objective, n_trials=100, timeout=600)

    print("Number of finished trials: {}".format(len(study.trials)))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

[32m[I 2022-12-17 23:03:19,719][0m A new study created in memory with name: no-name-0feebe5b-b138-4a1b-81e7-344428bf2e62[0m
  pruning_callback = CatBoostPruningCallback(trial, "Accuracy")
[32m[I 2022-12-17 23:03:21,445][0m Trial 0 finished with value: 0.7534498620055198 and parameters: {'objective': 'CrossEntropy', 'colsample_bylevel': 0.018614505605841686, 'depth': 11, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 7.443522448760261}. Best is trial 0 with value: 0.7534498620055198.[0m
  pruning_callback = CatBoostPruningCallback(trial, "Accuracy")
[32m[I 2022-12-17 23:03:32,151][0m Trial 1 finished with value: 0.7690892364305428 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.013952393157599437, 'depth': 7, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS'}. Best is trial 1 with value: 0.7690892364305428.[0m
  pruning_callback = CatBoostPruningCallback(trial, "Accuracy")
[32m[I 2022-12-17 23:03:34,575][0m Trial 2 finished wi

KeyboardInterrupt: 

In [None]:
bp = study.best_params
bp['cat_features'] = cat_features

In [None]:
results = cb.CatBoostClassifier(**bp).fit(df,Target).predict(X_test)

0:	learn: 0.6927640	total: 13.4ms	remaining: 13.4s
1:	learn: 0.6912281	total: 16.8ms	remaining: 8.37s
2:	learn: 0.6911367	total: 21.6ms	remaining: 7.16s
3:	learn: 0.6835135	total: 42.3ms	remaining: 10.5s
4:	learn: 0.6835109	total: 44.8ms	remaining: 8.9s
5:	learn: 0.6835084	total: 47.1ms	remaining: 7.81s
6:	learn: 0.6835060	total: 49.4ms	remaining: 7.01s
7:	learn: 0.6702324	total: 65ms	remaining: 8.06s
8:	learn: 0.6619049	total: 103ms	remaining: 11.4s
9:	learn: 0.6618984	total: 106ms	remaining: 10.5s
10:	learn: 0.6500916	total: 142ms	remaining: 12.8s
11:	learn: 0.6500826	total: 145ms	remaining: 11.9s
12:	learn: 0.6472579	total: 147ms	remaining: 11.2s
13:	learn: 0.6388168	total: 168ms	remaining: 11.8s
14:	learn: 0.6388107	total: 171ms	remaining: 11.2s
15:	learn: 0.6388050	total: 173ms	remaining: 10.6s
16:	learn: 0.6387996	total: 175ms	remaining: 10.1s
17:	learn: 0.6325948	total: 187ms	remaining: 10.2s
18:	learn: 0.6263083	total: 191ms	remaining: 9.87s
19:	learn: 0.6263044	total: 193ms	re

In [None]:
results = pd.concat([out,pd.Series(results)],axis = 1)
results.columns = results.columns[0],'Transported'
results.set_index('PassengerId',drop=True, inplace = True)
results.Transported = results.Transported.astype('bool')

In [None]:
results.to_csv('results.csv')

In [63]:
results

Unnamed: 0_level_0,Transported
PassengerId,Unnamed: 1_level_1
0013_01,True
0018_01,False
0019_01,True
0021_01,True
0023_01,False
...,...
9266_02,True
9269_01,False
9271_01,True
9273_01,True
