In [24]:
import pandas as pd

In [25]:
!pip install catboost



In [26]:
!pip freeze | grep "numpy\|pandas\|scikit-learn"

"grep" �� ���� ����७��� ��� ���譥�
��������, �ᯮ��塞�� �ணࠬ��� ��� ������ 䠩���.


In [27]:
# загружаем данные
train_df = pd.read_parquet("data/train_data.pqt")
test_df = pd.read_parquet("data/test_data.pqt")

In [28]:
cat_cols = [
    "channel_code", "city", "city_type",
    "okved", "segment", "start_cluster",
    "index_city_code", "ogrn_month", "ogrn_year",
]

In [29]:
# заполняем пропуски в категориальных признаках
for col in cat_cols:
    train_df[col] = train_df[col].astype(str).fillna("missing")
    test_df[col] = test_df[col].astype(str).fillna("missing")

In [30]:
X = train_df.drop(["id", "date", "end_cluster"], axis=1)
y = train_df["end_cluster"]

In [31]:
# отобранные по feature importance признаки

features_new = ['balance_amt_min', 'balance_amt_day_avg', 'channel_code', 'city',
       'city_type', 'index_city_code', 'ogrn_days_end_month',
       'ogrn_days_end_quarter', 'ogrn_month', 'ogrn_year', 'ogrn_exist_months',
       'okved', 'segment', 'cnt_b_oper_1m', 'sum_a_oper_3m', 'cnt_a_oper_3m',
       'cnt_b_oper_3m', 'cnt_c_oper_3m', 'sum_deb_d_oper_3m',
       'cnt_deb_d_oper_3m', 'sum_cred_d_oper_3m', 'cnt_cred_d_oper_3m',
       'sum_cred_e_oper_3m', 'cnt_days_cred_e_oper_3m', 'sum_deb_f_oper_3m',
       'cnt_days_deb_f_oper_3m', 'sum_cred_f_oper_3m', 'cnt_cred_f_oper_3m',
       'cnt_days_cred_f_oper_3m', 'cnt_deb_g_oper_3m',
       'cnt_days_deb_g_oper_3m', 'sum_cred_g_oper_3m', 'cnt_cred_g_oper_3m',
       'cnt_days_cred_g_oper_3m', 'cnt_deb_h_oper_3m',
       'cnt_days_deb_h_oper_3m', 'sum_cred_h_oper_3m', 'cnt_cred_h_oper_3m',
       'cnt_days_cred_h_oper_3m', 'start_cluster']

In [None]:
X = X[features_new].copy()

Перемножение важных признаков между собой может показать модели новые зависимости и связи между признаками, что (чаще всего) положительно сказывается на точности предсказаний.

In [None]:
from catboost import CatBoostClassifier
from itertools import combinations
from sklearn.model_selection import train_test_split

num_cols = X.select_dtypes(include=['number']).columns

# перемножаем числовые признаки
for col1, col2 in combinations(num_cols, 2):
    new_col = f"{col1}_x_{col2}"
    X[new_col] = X[col1] * X[col2]

cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

In [None]:
X.shape

Обучим CatBoost на новых признаках. Обучение будет длиться очень долго (много данных и признаков), поэтому ограничим размер обучающей выборки и количество итераций. Для определения ключевых признаков модели вполне достаточно 300 итераций.

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, train_size=100000, random_state=42)

model = CatBoostClassifier(
    iterations=400,
    learning_rate=0.2,
    depth=6,
    loss_function='MultiClass',
    eval_metric='Accuracy',
    verbose=50,
    random_state=42,
    task_type='GPU'
)

model.fit(X_train, y_train, cat_features=cat_cols)

In [None]:
# вычисляем важность новых признаков
feature_importances = model.get_feature_importance(prettified=True)
print(feature_importances.sort_values('Importances', ascending=False).head(20))

In [36]:
feature_importances.to_csv("multiply_feature_importances.csv", index=False)

In [37]:
feature_importances

Unnamed: 0,Feature Id,Importances
0,start_cluster,21.720875
1,okved,6.340600
2,segment,5.101557
3,channel_code,4.048093
4,city,3.977410
...,...,...
500,cnt_cred_g_oper_3m_x_cnt_deb_h_oper_3m,0.000000
501,cnt_cred_g_oper_3m_x_cnt_days_deb_h_oper_3m,0.000000
502,cnt_cred_g_oper_3m_x_sum_cred_h_oper_3m,0.000000
503,cnt_days_deb_h_oper_3m_x_cnt_days_cred_h_oper_3m,0.000000
