In [13]:
from pathlib import Path
import yaml
import pandas as pd

from sklearn.model_selection import train_test_split

from catboost import CatBoostClassifier
from catboost import EFeaturesSelectionAlgorithm, EShapCalcType

## Constants

In [3]:
# Seed shared by the train/validation split and the CatBoost model.
RANDOM_SEED = 77
# Passed to CatBoost as `thread_count`.
N_JOBS = 7
# Directory holding the preprocessed parquet files, relative to this notebook.
DATA_PATH = Path("..") / ".." / "data"

## Data

In [4]:
def _columns_with_prefix(frame, prefix):
    """Return the names of `frame`'s columns that start with `prefix`, as a list."""
    names = frame.columns
    return names[names.str.startswith(prefix)].tolist()


# Preprocessed training set.
df_train = pd.read_parquet(DATA_PATH / "train_preproc.parquet")

# Group the encoded columns by the encoder-name prefix of their column names.
ohe_cols = _columns_with_prefix(df_train, "OneHotEncoder")
oe_cols = _columns_with_prefix(df_train, "OrdinalEncoder")
te_cols = _columns_with_prefix(df_train, "MeanTargetEncoder")

In [5]:
# take numeric columns: drop "id", "target", "smpl" and every encoder-prefixed column
encoded_cols = te_cols + ohe_cols + oe_cols
X_full = df_train.drop(columns=["id", "target", "smpl"] + encoded_cols)
y_full = df_train["target"]

# Stratified 70/30 train/validation split.
# The unsplit data keeps its own names (X_full / y_full) so this cell is idempotent:
# splitting X_train from itself would shrink the training set on every re-run.
X_train, X_val, y_train, y_val = train_test_split(
    X_full,
    y_full,
    stratify=y_full,
    test_size=0.3,
    random_state=RANDOM_SEED,
)

## Model

In [11]:
# Base classifier used for feature selection: up to 2500 iterations with
# early stopping after 200 rounds, and no CatBoost files written to disk.
model = CatBoostClassifier(
    iterations=2500,
    early_stopping_rounds=200,
    random_state=RANDOM_SEED,
    thread_count=N_JOBS,
    verbose=0,
    allow_writing_files=False,
)

# Recursive SHAP-based elimination over all training columns: reduce to 50
# features in 10 steps, evaluating on the validation split.
summary = model.select_features(
    X_train,
    y_train,
    eval_set=(X_val, y_val),
    features_for_select=X_train.columns,
    num_features_to_select=50,
    steps=10,
    algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues,
    shap_calc_type=EShapCalcType.Regular,
    verbose=False,
)

Learning rate set to 0.086245
Step #1 out of 10
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.1892381509
bestIteration = 406

Shrink model to first 407 iterations.
Feature #43 eliminated
Feature #88 eliminated
Feature #130 eliminated
Feature #20 eliminated
Feature #120 eliminated
Feature #3 eliminated
Feature #138 eliminated
Feature #145 eliminated
Feature #13 eliminated
Feature #102 eliminated
Feature #96 eliminated
Feature #24 eliminated
Feature #85 eliminated
Feature #100 eliminated
Feature #67 eliminated
Step #2 out of 10
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.1891319461
bestIteration = 377

Shrink model to first 378 iterations.
Feature #133 eliminated
Feature #95 eliminated
Feature #82 eliminated
Feature #46 eliminated
Feature #121 eliminated
Feature #142 eliminated
Feature #97 eliminated
Feature #111 eliminated
Feature #31 eliminated
Feature #71 eliminated
Feature #146 eliminated
Feature #81 eliminated
Feature #76 eliminated
Fe

In [17]:
# Store the selected feature names under the "selected_features" key of the config.
# The file is read, updated and rewritten instead of being opened in append mode:
# appending adds another top-level "selected_features" key each time this cell is
# re-run, leaving config.yaml with duplicate keys.
# NOTE(review): rewriting through PyYAML drops any comments/formatting in config.yaml — confirm acceptable.
config_path = Path("../../configs/config.yaml")

config = {}
if config_path.exists():
    with open(config_path, encoding="utf-8") as f:
        # An empty file loads as None; treat it as an empty mapping.
        config = yaml.safe_load(f) or {}

if not isinstance(config, dict):
    raise TypeError(f"Expected a mapping at the top level of {config_path}, got {type(config).__name__}")

config["selected_features"] = summary["selected_features_names"]

with open(config_path, "w", encoding="utf-8") as f:
    # sort_keys=False keeps the existing key order of the config.
    yaml.dump(config, f, sort_keys=False)