In [1]:
from pathlib import Path
import yaml

import pandas as pd

from sklearn.model_selection import train_test_split

from catboost import CatBoostClassifier, Pool
from catboost import EFeaturesSelectionAlgorithm, EShapCalcType

# Select features
The input data has many features (189), which may be too many for modelling. To speed up computation and obtain better scores (by discarding uninformative features), 50 numerical features are selected. They are concatenated with all 12 categorical features and used in the downstream models. Feature selection is performed with `CatBoost` using `Shapley values`.

## Constants

In [None]:
# --- Filesystem locations (relative to the notebook's working directory) ---
CONFIG_FILE = Path("../configs/config.yaml")  # YAML file that receives the selected feature names
DATA_PATH = Path("../data/")                  # directory the training parquet file is read from

# --- Run settings ---
N_JOBS = 16       # passed to CatBoost as `thread_count`
RANDOM_SEED = 77  # shared seed for the train/validation split and the CatBoost model

## Feature selection with `CatBoost` and `Shapley values`

In [3]:
# Load the preprocessed training table.
df_train = pd.read_parquet(DATA_PATH / "train_preproc_2.parquet")

# Integer-typed columns (excluding "target" and "id") are handed to CatBoost as
# categorical features; every remaining column except "smpl" is treated as numerical.
# NOTE(review): `select_dtypes(int)` relies on the platform's default integer width -- confirm
# it picks up all intended categorical columns on the machine running this notebook.
non_feature_columns = ["target", "id"]
cat_columns = (
    df_train
    .drop(columns=non_feature_columns)
    .select_dtypes(int)
    .columns.values.tolist()
)
num_columns = (
    df_train
    .drop(columns=non_feature_columns + ["smpl"] + cat_columns)
    .columns.values.tolist()
)
feat_columns = cat_columns + num_columns

# Stratified 70/30 train/validation split on the target.
X_train, X_val, y_train, y_val = train_test_split(
    df_train[feat_columns],
    df_train["target"],
    stratify=df_train["target"],
    test_size=0.3,
    random_state=RANDOM_SEED,
)

train_pool = Pool(X_train, y_train, cat_features=cat_columns)
val_pool = Pool(X_val, y_val, cat_features=cat_columns)

model = CatBoostClassifier(
    iterations=2500,
    early_stopping_rounds=200,
    random_state=RANDOM_SEED,
    thread_count=N_JOBS,
    verbose=0,
    allow_writing_files=False,
)

# Recursive elimination driven by SHAP values: only the numerical columns are
# candidates for removal, so the categorical columns are always kept.
summary = model.select_features(
    train_pool,
    eval_set=val_pool,
    features_for_select=num_columns,
    num_features_to_select=50,
    steps=10,
    algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues,
    shap_calc_type=EShapCalcType.Regular,
    verbose=False,
)

Learning rate set to 0.086245
Step #1 out of 10
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.1882426112
bestIteration = 343

Shrink model to first 344 iterations.
Feature #147 eliminated
Feature #14 eliminated
Feature #135 eliminated
Feature #33 eliminated
Feature #15 eliminated
Feature #177 eliminated
Feature #20 eliminated
Feature #63 eliminated
Feature #113 eliminated
Feature #60 eliminated
Feature #178 eliminated
Feature #17 eliminated
Feature #75 eliminated
Feature #49 eliminated
Feature #148 eliminated
Feature #72 eliminated
Feature #100 eliminated
Feature #70 eliminated
Feature #46 eliminated
Feature #139 eliminated
Step #2 out of 10
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.1881496844
bestIteration = 538

Shrink model to first 539 iterations.
Feature #95 eliminated
Feature #96 eliminated
Feature #105 eliminated
Feature #140 eliminated
Feature #94 eliminated
Feature #77 eliminated
Feature #112 eliminated
Feature #97 eliminated
F

## Save selected features in a config file.

In [4]:
# Persist the selected feature names under the `selected_features` key of the config file.
#
# Blindly appending on every run would leave several top-level `selected_features:` keys
# in the YAML after a re-run (duplicate keys; loaders silently keep only one of them).
# To keep this cell idempotent:
#   * if the key is already present, rewrite the config with the key replaced;
#   * otherwise append to the file, which leaves its existing content untouched.
existing_config = None
if CONFIG_FILE.exists():
    with CONFIG_FILE.open("r") as f:
        existing_config = yaml.safe_load(f)

if isinstance(existing_config, dict) and "selected_features" in existing_config:
    existing_config["selected_features"] = summary["selected_features_names"]
    # NOTE: rewriting drops any comments from the file; `sort_keys=False` keeps the key order.
    with CONFIG_FILE.open("w") as f:
        yaml.dump(existing_config, f, sort_keys=False)
else:
    with CONFIG_FILE.open("a") as f:
        yaml.dump({"selected_features": summary["selected_features_names"]}, f)