In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
import catboost as cb
import optuna
import shap
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [4]:
train = pd.read_csv('train.csv.zip', compression='zip')

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900000 entries, 0 to 899999
Data columns (total 33 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   id      900000 non-null  int64  
 1   f_00    900000 non-null  float64
 2   f_01    900000 non-null  float64
 3   f_02    900000 non-null  float64
 4   f_03    900000 non-null  float64
 5   f_04    900000 non-null  float64
 6   f_05    900000 non-null  float64
 7   f_06    900000 non-null  float64
 8   f_07    900000 non-null  int64  
 9   f_08    900000 non-null  int64  
 10  f_09    900000 non-null  int64  
 11  f_10    900000 non-null  int64  
 12  f_11    900000 non-null  int64  
 13  f_12    900000 non-null  int64  
 14  f_13    900000 non-null  int64  
 15  f_14    900000 non-null  int64  
 16  f_15    900000 non-null  int64  
 17  f_16    900000 non-null  int64  
 18  f_17    900000 non-null  int64  
 19  f_18    900000 non-null  int64  
 20  f_19    900000 non-null  float64
 21  f_20    90

In [6]:
X = train.drop('target',axis=1)
y = train[['target']]
lb = LabelEncoder()
X.f_27 = lb.fit_transform(X.f_27)

In [7]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 900000 entries, 0 to 899999
Data columns (total 32 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   id      900000 non-null  int64  
 1   f_00    900000 non-null  float64
 2   f_01    900000 non-null  float64
 3   f_02    900000 non-null  float64
 4   f_03    900000 non-null  float64
 5   f_04    900000 non-null  float64
 6   f_05    900000 non-null  float64
 7   f_06    900000 non-null  float64
 8   f_07    900000 non-null  int64  
 9   f_08    900000 non-null  int64  
 10  f_09    900000 non-null  int64  
 11  f_10    900000 non-null  int64  
 12  f_11    900000 non-null  int64  
 13  f_12    900000 non-null  int64  
 14  f_13    900000 non-null  int64  
 15  f_14    900000 non-null  int64  
 16  f_15    900000 non-null  int64  
 17  f_16    900000 non-null  int64  
 18  f_17    900000 non-null  int64  
 19  f_18    900000 non-null  int64  
 20  f_19    900000 non-null  float64
 21  f_20    90

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, shuffle=True)

In [15]:
class Model:
    __slots__ = ('best_estimator', 'catboost_model', 'X_train', 'X_test', 'y_train', 'y_test')
    
    def __init__(self, X, y):
        self.best_estimator = None
        self.catboost_model = None
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X,y, test_size = 0.2, shuffle=True)
        
    def callback(self, study, trial):
        if study.best_trial.number == trial.number:
            self.best_booster = self.catboost_model
            
    def obj(self, trial):
        model = cb.CatBoostClassifier(
        iterations=trial.suggest_int("iterations", 1, 1000),
        learning_rate=trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
        depth=trial.suggest_int("depth", 4, 10),
        l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 1e-8, 100.0, log=True),
        bootstrap_type=trial.suggest_categorical("bootstrap_type", ["Bayesian"]),
        random_strength=trial.suggest_float("random_strength", 1e-8, 10.0, log=True),
        bagging_temperature=trial.suggest_float("bagging_temperature", 0.0, 10.0),
        od_type=trial.suggest_categorical("od_type", ["IncToDec", "Iter"]),
        od_wait=trial.suggest_int("od_wait", 10, 50),
        verbose=False
    )
        model.fit(self.X_train, self.y_train)
        self.catboost_model = model
        y_pred = model.predict(self.X_test)
        return accuracy_score(self.y_test, y_pred)
    
    def test(self):
        study = optuna.create_study(study_name="catboost", direction="maximize")
        study.optimize(self.obj, n_trials=5, n_jobs=-1, callbacks=[self.callback])
        return self.best_estimator




In [16]:
model = Model(X, y)

In [17]:
best = model.test()

[I 2023-10-10 22:12:38,392] A new study created in memory with name: catboost
[I 2023-10-10 22:12:49,174] Trial 2 finished with value: 0.6658444444444445 and parameters: {'iterations': 19, 'learning_rate': 0.011247003301496892, 'depth': 6, 'l2_leaf_reg': 1.8428683309314092e-07, 'bootstrap_type': 'Bayesian', 'random_strength': 2.966472916962021, 'bagging_temperature': 3.595347449035083, 'od_type': 'IncToDec', 'od_wait': 41}. Best is trial 2 with value: 0.6658444444444445.
[I 2023-10-10 22:16:38,228] Trial 4 finished with value: 0.7638444444444444 and parameters: {'iterations': 477, 'learning_rate': 0.029996824705549877, 'depth': 5, 'l2_leaf_reg': 1.5538685777965289, 'bootstrap_type': 'Bayesian', 'random_strength': 0.0003063441878101009, 'bagging_temperature': 7.18178290854283, 'od_type': 'IncToDec', 'od_wait': 19}. Best is trial 4 with value: 0.7638444444444444.
[I 2023-10-10 22:17:12,149] Trial 1 finished with value: 0.7252444444444445 and parameters: {'iterations': 796, 'learning_rate

In [20]:
type(best)

NoneType