In [6]:
from pathlib import Path
import yaml
import pandas as pd

from sklearn.model_selection import train_test_split

from catboost import CatBoostClassifier
from catboost import EFeaturesSelectionAlgorithm, EShapCalcType
from catboost import Pool

## Constants

In [2]:
# Seed shared by the train/validation split and the CatBoost model.
RANDOM_SEED = 77

# Number of threads handed to CatBoost via `thread_count`.
N_JOBS = 7

# Directory the preprocessed parquet files are read from (relative to this notebook).
DATA_PATH = Path("../../data/")

## Data

In [8]:
def _columns_with_prefix(frame, prefix):
    """Return the column names of `frame` that start with `prefix`, in frame order."""
    prefix_mask = frame.columns.str.startswith(prefix)
    return frame.columns[prefix_mask].tolist()


# Preprocessed training table; column names carry the name of the transformer
# that produced them as a prefix, which is what the groups below key on.
df_train = pd.read_parquet(DATA_PATH / "train_preproc.parquet")

ohe_cols = _columns_with_prefix(df_train, "OneHotEncoder")
oe_cols = _columns_with_prefix(df_train, "OrdinalEncoder")
te_cols = _columns_with_prefix(df_train, "MeanTargetEncoder")
num_cols = _columns_with_prefix(df_train, "numeric")

In [4]:
# Everything except the bookkeeping columns and the mean-target-encoded
# columns is used as a feature; the label is the `target` column.
excluded_cols = ["id", "target", "smpl"] + te_cols
X_train = df_train.drop(columns=excluded_cols)
y_train = df_train["target"]

In [5]:
# Hold out 30% of the rows for validation, stratified on the label.
# NOTE(review): this rebinds X_train / y_train, so re-running this cell on its
# own splits the already-reduced training set again — re-run the previous cell first.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.3,
    stratify=y_train,
    random_state=RANDOM_SEED,
)

## Model

In [7]:
# Ordinal- and one-hot-encoded columns are declared to CatBoost as categorical.
categorical_cols = oe_cols + ohe_cols

train_pool = Pool(data=X_train, label=y_train, cat_features=categorical_cols)
val_pool = Pool(data=X_val, label=y_val, cat_features=categorical_cols)

In [12]:
# Classifier used as the estimator inside the feature-selection loop.
model = CatBoostClassifier(
    iterations=2500,
    early_stopping_rounds=200,
    random_state=RANDOM_SEED,
    thread_count=N_JOBS,
    verbose=0,
    allow_writing_files=False,
)

# Recursive elimination by SHAP values: only the numeric columns are candidates
# for removal, and 50 of them are kept after 10 elimination steps.
selection_settings = {
    "features_for_select": num_cols,
    "num_features_to_select": 50,
    "algorithm": EFeaturesSelectionAlgorithm.RecursiveByShapValues,
    "shap_calc_type": EShapCalcType.Regular,
    "steps": 10,
    "verbose": False,
}

summary = model.select_features(train_pool, eval_set=val_pool, **selection_settings)

Learning rate set to 0.086245
Step #1 out of 10
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.189491792
bestIteration = 228

Shrink model to first 229 iterations.
Feature #155 eliminated
Feature #37 eliminated
Feature #93 eliminated
Feature #105 eliminated
Feature #73 eliminated
Feature #162 eliminated
Feature #53 eliminated
Feature #74 eliminated
Feature #115 eliminated
Feature #41 eliminated
Feature #152 eliminated
Feature #30 eliminated
Feature #60 eliminated
Feature #63 eliminated
Feature #114 eliminated
Step #2 out of 10
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.1890863225
bestIteration = 396

Shrink model to first 397 iterations.
Feature #150 eliminated
Feature #119 eliminated
Feature #88 eliminated
Feature #113 eliminated
Feature #47 eliminated
Feature #112 eliminated
Feature #117 eliminated
Feature #20 eliminated
Feature #33 eliminated
Feature #147 eliminated
Feature #44 eliminated
Feature #84 eliminated
Feature #137 eliminated


In [17]:
# Store the selected feature names under the `selected_features` key of the
# project config.
#
# The file is read, updated and rewritten instead of being opened in append
# mode: appending added another `selected_features:` entry on every re-run of
# this cell (duplicate keys in one YAML mapping) and produced a broken document
# whenever the existing file did not end with a newline.
# NOTE: rewriting through PyYAML drops any comments present in config.yaml.
config_path = Path("../../configs/config.yaml")

if config_path.exists():
    with config_path.open("r") as f:
        # An empty file parses to None; treat it as an empty mapping.
        config = yaml.safe_load(f) or {}
else:
    config = {}

if not isinstance(config, dict):
    raise ValueError(f"{config_path} must contain a YAML mapping at the top level")

config["selected_features"] = summary["selected_features_names"]

with config_path.open("w") as f:
    # sort_keys=False keeps the existing key order of the config.
    yaml.dump(config, f, sort_keys=False)