In [1]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [2]:
# Single random seed, reused for the train/validation split and for CatBoost's random_seed.
seed = 322

## Датасеты



Читаем трейн и тест


In [3]:
# Read the training table, then discard the 'ID' column before any feature handling.
df_path = 'datasets/FIIT/train.csv'
df = pd.read_csv(df_path)
df = df.drop(columns=['ID'])

In [None]:
# Read the prediction set; set its 'ID' column aside for the submission file
# and remove it from the feature frame in the same step.
test_path = 'datasets/FIIT/data_predict.csv'
test_df = pd.read_csv(test_path)
test_df_id = test_df.pop('ID')

Убираем колонки, целиком состоящие из NaN


In [4]:
# Find the columns that are entirely NaN in the training frame and remove that
# same set from both frames. Deciding independently per frame (a separate
# dropna on each) can leave train and test with different feature columns,
# which breaks prediction with the fitted model.
all_nan_columns = df.columns[df.isna().all(axis=0)]
df = df.drop(columns=all_nan_columns)
# errors='ignore': a column that exists only in the training frame is skipped here.
test_df = test_df.drop(columns=all_nan_columns, errors='ignore')

Отделяем метки и приводим их в стандартный вид


In [5]:
# Features are every column except the label.
X = df.drop(columns='Target')
# Rescale the label with (y + 1) / 2 — presumably maps {-1, 1} to {0, 1};
# confirm against the raw data.
y = df['Target'].add(1).div(2)

Разбиваем данные на обучающую и валидационную подвыборки


In [6]:
# Hold out 20% of the rows for validation, stratified on the label and
# seeded so the split is repeatable.
split_options = dict(test_size=0.2, random_state=seed, stratify=y)
X_train_cat, X_val_cat, y_train, y_val = train_test_split(X, y, **split_options)

Находим категориальные фичи: если указать их в Pool, CatBoost будет обрабатывать их по-умному



In [7]:
# Every column whose dtype is not `float` is flagged as categorical;
# the positional indices of those columns are what gets passed to CatBoost.
is_cat = X.dtypes.ne(float)
cat_features_index = np.flatnonzero(is_cat.to_numpy())

In [8]:
# Wrap both splits in CatBoost Pools that share the same categorical column indices.
pool_options = {'cat_features': cat_features_index}
train_pool = Pool(X_train_cat, label=y_train, **pool_options)
val_pool = Pool(X_val_cat, label=y_val, **pool_options)

## Моделька



Оптимальные параметры, подобранные с помощью Optuna


In [9]:
# CatBoost hyperparameters: Logloss objective, AUC as the evaluation metric,
# seeded with the notebook-wide seed, training log silenced.
params_cat = dict(
    max_depth=8,
    learning_rate=0.024986069141434246,
    l2_leaf_reg=2.964950034617931,
    loss_function="Logloss",
    eval_metric='AUC',
    border_count=131,
    random_seed=seed,
    verbose=False,
)

In [10]:
# Build the classifier from the params_cat hyperparameter dict.
cat_clf = CatBoostClassifier(**params_cat)

ТренировОчка

In [11]:
# Train on the training pool; the validation pool is the eval set, and
# use_best_model=True keeps the iteration that scored best on it.
cat_clf.fit(train_pool, eval_set=val_pool, use_best_model=True)

<catboost.core.CatBoostClassifier at 0x7f91202897f0>

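Проверка на валидационной выборке (она же использовалась для выбора лучшей итерации, поэтому оценка может быть завышена)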


In [15]:
# ROC AUC on the validation split, using the second probability column (class 1).
val_proba = cat_clf.predict_proba(X_val_cat)
roc_auc_score(y_val, val_proba[:, 1])

0.7600558280591887

## Создание submission


In [12]:
# Score the prediction set and keep the second probability column (class 1) as a Series.
test_proba = cat_clf.predict_proba(test_df)
y_pred_cat = pd.Series(test_proba[:, 1])

In [13]:
# Pair each saved ID with its predicted probability and write the table to CSV
# without the index column.
submission_columns = {'ID': test_df_id.to_list(), 'Target': y_pred_cat}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)