### Загрузка библиотек и данных, ограничение ресурсов

In [1]:
import os
# Restrict this process to a single GPU: enumerate devices in PCI bus order
# and expose only the device with index 2.
os.environ.update({
    'CUDA_DEVICE_ORDER': "PCI_BUS_ID",
    'CUDA_VISIBLE_DEVICES': '2',
})

In [2]:
import pandas as pd
import numpy as np

In [165]:
# The preprocessed datasets have two source columns already split:
#     'oper_type + oper_attr' -> 'oper_type' and 'oper_attr',
#     'index_oper' -> 'index_reg' and 'index_loc' (the first and the second
#     three digits of the index, respectively).
# Features with <= 1000 unique values are additionally marked as categorical.
train_df, test_df = map(
    pd.read_csv,
    ('../data/preprocessed_train.csv', '../data/preprocessed_test.csv'),
)

In [164]:
# An experiment merged rarely occurring categories into a single separate category
# to reduce the complexity of the data, but it did not give a noticeable improvement.
# train_df_2 = pd.read_csv('../data/train_dataset_rares.csv')
# test_df_2 = pd.read_csv('../data/test_dataset_rares.csv')

  test_df_2 = pd.read_csv('../data/test_dataset_rares.csv')


Разбиение размеченного датасета на обучающую и тестовую части. Так как в тренировочном датасете примеров с label==0 значительно больше, чем с label==1, подсчитываем значение class_weight и присваиваем каждому примеру вес его класса.

In [138]:
from sklearn.model_selection import train_test_split

seed = 481516  # 2342 — presumably an alternative seed tried earlier

# The preprocessed dataset contains 'oper_type', 'oper_attr', 'index_reg' and
# 'index_loc'. Models trained with these features clearly overfit: the metric
# grows on the held-out split while the public leaderboard score drops, so the
# columns are removed before splitting.
# Note: both resulting frames still contain the 'label' column.
X_train, X_test = train_test_split(
    train_df.drop(columns=['oper_type', 'oper_attr', 'index_reg', 'index_loc']),
    test_size=.1,
    random_state=seed,
)

In [132]:
from sklearn.utils.class_weight import compute_class_weight
# 'balanced' class weights for labels 0 and 1; the result is indexed by label
# value (class_weight[0], class_weight[1]). The weights are computed from the
# labels of the full train_df, not only from the training split.
# scikit-learn documents `classes` as an ndarray and recent releases validate
# the parameter type, so an array is passed instead of a plain list.
class_weight = compute_class_weight('balanced', classes=np.array([0, 1]), y=train_df.label)
print(class_weight)

[ 0.51466627 17.5459118 ]


In [133]:
# Per-row training weight: look up the class weight by each row's label value.
sample_weight = [class_weight[label] for label in X_train['label']]

### Обучение моделей *random forest, catboost, xgboost* и *lgbm* с автоматическим подбором гиперпараметров с помощью библиотеки flaml

In [200]:
from flaml import AutoML

# FLAML hyper-parameter search restricted to LightGBM.
automl_lgbm = AutoML(n_jobs=-1)

settings = dict(
    time_budget=5800,                         # total running time in seconds
    estimator_list=['lgbm'],                  # only LightGBM is tuned in this run
    task='classification',                    # task type
    log_file_name='pochta.log',               # flaml log file
    seed=seed,                                # random seed
    sample_weight=np.squeeze(sample_weight),  # per-row class weights
)

# X_train still carries the target, so it is split into features / label here.
automl_lgbm.fit(
    X_train=X_train.drop(columns='label'),
    y_train=X_train['label'],
    **settings,
)

[flaml.automl: 11-20 00:38:22] {2599} INFO - task = classification
[flaml.automl: 11-20 00:38:22] {2601} INFO - Data split method: stratified
[flaml.automl: 11-20 00:38:22] {2604} INFO - Evaluation method: holdout
[flaml.automl: 11-20 00:38:36] {2726} INFO - Minimizing error metric: 1-roc_auc
[flaml.automl: 11-20 00:38:36] {2870} INFO - List of ML learners in AutoML Run: ['lgbm']
[flaml.automl: 11-20 00:38:36] {3166} INFO - iteration 0, current learner lgbm
[flaml.automl: 11-20 00:38:36] {3296} INFO - Estimated sufficient time budget=4625940s. Estimated necessary time budget=4626s.
[flaml.automl: 11-20 00:38:36] {3343} INFO -  at 102.9s,	estimator lgbm's best error=0.1008,	best estimator lgbm's best error=0.1008
[flaml.automl: 11-20 00:38:36] {3166} INFO - iteration 1, current learner lgbm
[flaml.automl: 11-20 00:38:37] {3343} INFO -  at 103.1s,	estimator lgbm's best error=0.1008,	best estimator lgbm's best error=0.1008
[flaml.automl: 11-20 00:38:37] {3166} INFO - iteration 2, current 

[flaml.automl: 11-20 00:43:54] {3343} INFO -  at 420.0s,	estimator lgbm's best error=0.0501,	best estimator lgbm's best error=0.0501
[flaml.automl: 11-20 00:43:54] {3166} INFO - iteration 37, current learner lgbm
[flaml.automl: 11-20 00:44:09] {3343} INFO -  at 435.2s,	estimator lgbm's best error=0.0501,	best estimator lgbm's best error=0.0501
[flaml.automl: 11-20 00:44:09] {3166} INFO - iteration 38, current learner lgbm
[flaml.automl: 11-20 00:44:27] {3343} INFO -  at 453.1s,	estimator lgbm's best error=0.0501,	best estimator lgbm's best error=0.0501
[flaml.automl: 11-20 00:44:27] {3166} INFO - iteration 39, current learner lgbm
[flaml.automl: 11-20 00:44:41] {3343} INFO -  at 467.3s,	estimator lgbm's best error=0.0501,	best estimator lgbm's best error=0.0501
[flaml.automl: 11-20 00:44:41] {3166} INFO - iteration 40, current learner lgbm
[flaml.automl: 11-20 00:44:53] {3343} INFO -  at 479.3s,	estimator lgbm's best error=0.0501,	best estimator lgbm's best error=0.0501
[flaml.automl: 

[flaml.automl: 11-20 01:04:25] {3166} INFO - iteration 75, current learner lgbm
[flaml.automl: 11-20 01:05:25] {3343} INFO -  at 1711.5s,	estimator lgbm's best error=0.0490,	best estimator lgbm's best error=0.0490
[flaml.automl: 11-20 01:05:25] {3166} INFO - iteration 76, current learner lgbm
[flaml.automl: 11-20 01:05:42] {3343} INFO -  at 1728.4s,	estimator lgbm's best error=0.0490,	best estimator lgbm's best error=0.0490
[flaml.automl: 11-20 01:05:42] {3166} INFO - iteration 77, current learner lgbm
[flaml.automl: 11-20 01:06:43] {3343} INFO -  at 1789.1s,	estimator lgbm's best error=0.0489,	best estimator lgbm's best error=0.0489
[flaml.automl: 11-20 01:06:43] {3166} INFO - iteration 78, current learner lgbm
[flaml.automl: 11-20 01:08:51] {3343} INFO -  at 1917.2s,	estimator lgbm's best error=0.0489,	best estimator lgbm's best error=0.0489
[flaml.automl: 11-20 01:08:51] {3166} INFO - iteration 79, current learner lgbm
[flaml.automl: 11-20 01:09:29] {3343} INFO -  at 1955.3s,	estima

In [201]:
from flaml import AutoML

# FLAML hyper-parameter search restricted to XGBoost.
automl_xgb = AutoML(n_jobs=-1)

settings = dict(
    time_budget=3500,                         # total running time in seconds
    estimator_list=['xgboost'],               # only XGBoost is tuned in this run
    task='classification',                    # task type
    log_file_name='pochta.log',               # flaml log file
    seed=seed,                                # random seed
    sample_weight=np.squeeze(sample_weight),  # per-row class weights
)

# X_train still carries the target, so it is split into features / label here.
automl_xgb.fit(
    X_train=X_train.drop(columns='label'),
    y_train=X_train['label'],
    **settings,
)

[flaml.automl: 11-20 02:21:40] {2599} INFO - task = classification
[flaml.automl: 11-20 02:21:40] {2601} INFO - Data split method: stratified
[flaml.automl: 11-20 02:21:40] {2604} INFO - Evaluation method: holdout
[flaml.automl: 11-20 02:21:48] {2726} INFO - Minimizing error metric: 1-roc_auc
[flaml.automl: 11-20 02:21:48] {2870} INFO - List of ML learners in AutoML Run: ['xgboost']
[flaml.automl: 11-20 02:21:48] {3166} INFO - iteration 0, current learner xgboost
[flaml.automl: 11-20 02:21:49] {3296} INFO - Estimated sufficient time budget=6256558s. Estimated necessary time budget=6257s.
[flaml.automl: 11-20 02:21:49] {3343} INFO -  at 37.1s,	estimator xgboost's best error=0.1098,	best estimator xgboost's best error=0.1098
[flaml.automl: 11-20 02:21:49] {3166} INFO - iteration 1, current learner xgboost
[flaml.automl: 11-20 02:21:49] {3343} INFO -  at 37.6s,	estimator xgboost's best error=0.1098,	best estimator xgboost's best error=0.1098
[flaml.automl: 11-20 02:21:49] {3166} INFO - it

[flaml.automl: 11-20 02:22:12] {3343} INFO -  at 60.2s,	estimator xgboost's best error=0.0572,	best estimator xgboost's best error=0.0572
[flaml.automl: 11-20 02:22:12] {3166} INFO - iteration 19, current learner xgboost
[flaml.automl: 11-20 02:22:15] {3343} INFO -  at 62.8s,	estimator xgboost's best error=0.0552,	best estimator xgboost's best error=0.0552
[flaml.automl: 11-20 02:22:15] {3166} INFO - iteration 20, current learner xgboost
[flaml.automl: 11-20 02:22:27] {3343} INFO -  at 74.9s,	estimator xgboost's best error=0.0531,	best estimator xgboost's best error=0.0531
[flaml.automl: 11-20 02:22:27] {3166} INFO - iteration 21, current learner xgboost
[flaml.automl: 11-20 02:22:28] {3343} INFO -  at 76.1s,	estimator xgboost's best error=0.0531,	best estimator xgboost's best error=0.0531
[flaml.automl: 11-20 02:22:28] {3166} INFO - iteration 22, current learner xgboost
[flaml.automl: 11-20 02:22:44] {3343} INFO -  at 91.7s,	estimator xgboost's best error=0.0531,	best estimator xgboos

[flaml.automl: 11-20 02:41:39] {3343} INFO -  at 1227.3s,	estimator xgboost's best error=0.0493,	best estimator xgboost's best error=0.0493
[flaml.automl: 11-20 02:41:39] {3166} INFO - iteration 39, current learner xgboost
[flaml.automl: 11-20 02:43:56] {3343} INFO -  at 1364.0s,	estimator xgboost's best error=0.0493,	best estimator xgboost's best error=0.0493
[flaml.automl: 11-20 02:43:56] {3166} INFO - iteration 40, current learner xgboost
[flaml.automl: 11-20 02:44:58] {3343} INFO -  at 1426.1s,	estimator xgboost's best error=0.0492,	best estimator xgboost's best error=0.0492
[flaml.automl: 11-20 02:44:58] {3166} INFO - iteration 41, current learner xgboost
[flaml.automl: 11-20 02:45:23] {3343} INFO -  at 1450.8s,	estimator xgboost's best error=0.0492,	best estimator xgboost's best error=0.0492
[flaml.automl: 11-20 02:45:23] {3166} INFO - iteration 42, current learner xgboost
[flaml.automl: 11-20 02:47:36] {3343} INFO -  at 1584.4s,	estimator xgboost's best error=0.0489,	best estima

In [202]:
from flaml import AutoML

# FLAML hyper-parameter search restricted to CatBoost.
automl_cb = AutoML(n_jobs=-1)

settings = dict(
    time_budget=4800,                         # total running time in seconds
    estimator_list=['catboost'],              # only CatBoost is tuned in this run
    task='classification',                    # task type
    log_file_name='pochta.log',               # flaml log file
    seed=seed,                                # random seed
    sample_weight=np.squeeze(sample_weight),  # per-row class weights
)

# X_train still carries the target, so it is split into features / label here.
automl_cb.fit(
    X_train=X_train.drop(columns='label'),
    y_train=X_train['label'],
    **settings,
)

[flaml.automl: 11-20 03:22:57] {2599} INFO - task = classification
[flaml.automl: 11-20 03:22:57] {2601} INFO - Data split method: stratified
[flaml.automl: 11-20 03:22:57] {2604} INFO - Evaluation method: holdout
[flaml.automl: 11-20 03:23:03] {2726} INFO - Minimizing error metric: 1-roc_auc
[flaml.automl: 11-20 03:23:03] {2870} INFO - List of ML learners in AutoML Run: ['catboost']
[flaml.automl: 11-20 03:23:03] {3166} INFO - iteration 0, current learner catboost
[flaml.automl: 11-20 03:23:07] {3296} INFO - Estimated sufficient time budget=19603879s. Estimated necessary time budget=19604s.
[flaml.automl: 11-20 03:23:07] {3343} INFO -  at 44.8s,	estimator catboost's best error=0.0903,	best estimator catboost's best error=0.0903
[flaml.automl: 11-20 03:23:07] {3166} INFO - iteration 1, current learner catboost
[flaml.automl: 11-20 03:23:13] {3343} INFO -  at 50.0s,	estimator catboost's best error=0.0756,	best estimator catboost's best error=0.0756
[flaml.automl: 11-20 03:23:13] {3166} 

[flaml.automl: 11-20 03:36:49] {3343} INFO -  at 866.8s,	estimator catboost's best error=0.0507,	best estimator catboost's best error=0.0507
[flaml.automl: 11-20 03:36:49] {3166} INFO - iteration 35, current learner catboost
[flaml.automl: 11-20 03:36:53] {3343} INFO -  at 870.2s,	estimator catboost's best error=0.0507,	best estimator catboost's best error=0.0507
[flaml.automl: 11-20 03:36:53] {3166} INFO - iteration 36, current learner catboost
[flaml.automl: 11-20 03:36:56] {3343} INFO -  at 873.6s,	estimator catboost's best error=0.0507,	best estimator catboost's best error=0.0507
[flaml.automl: 11-20 03:36:56] {3166} INFO - iteration 37, current learner catboost
[flaml.automl: 11-20 03:37:05] {3343} INFO -  at 881.9s,	estimator catboost's best error=0.0507,	best estimator catboost's best error=0.0507
[flaml.automl: 11-20 03:37:05] {3166} INFO - iteration 38, current learner catboost
[flaml.automl: 11-20 03:37:10] {3343} INFO -  at 887.5s,	estimator catboost's best error=0.0507,	bes

[flaml.automl: 11-20 03:55:18] {3166} INFO - iteration 71, current learner catboost
[flaml.automl: 11-20 03:55:24] {3343} INFO -  at 1980.9s,	estimator catboost's best error=0.0507,	best estimator catboost's best error=0.0507
[flaml.automl: 11-20 03:55:24] {3166} INFO - iteration 72, current learner catboost
[flaml.automl: 11-20 03:55:58] {3343} INFO -  at 2015.3s,	estimator catboost's best error=0.0507,	best estimator catboost's best error=0.0507
[flaml.automl: 11-20 03:55:58] {3166} INFO - iteration 73, current learner catboost
[flaml.automl: 11-20 03:56:04] {3343} INFO -  at 2021.5s,	estimator catboost's best error=0.0507,	best estimator catboost's best error=0.0507
[flaml.automl: 11-20 03:56:04] {3166} INFO - iteration 74, current learner catboost
[flaml.automl: 11-20 03:56:14] {3343} INFO -  at 2031.5s,	estimator catboost's best error=0.0507,	best estimator catboost's best error=0.0507
[flaml.automl: 11-20 03:56:14] {3166} INFO - iteration 75, current learner catboost
[flaml.autom

[flaml.automl: 11-20 04:10:16] {3343} INFO -  at 2873.5s,	estimator catboost's best error=0.0507,	best estimator catboost's best error=0.0507
[flaml.automl: 11-20 04:10:16] {3166} INFO - iteration 108, current learner catboost
[flaml.automl: 11-20 04:10:18] {3343} INFO -  at 2875.8s,	estimator catboost's best error=0.0507,	best estimator catboost's best error=0.0507
[flaml.automl: 11-20 04:10:18] {3166} INFO - iteration 109, current learner catboost
[flaml.automl: 11-20 04:10:27] {3343} INFO -  at 2884.1s,	estimator catboost's best error=0.0507,	best estimator catboost's best error=0.0507
[flaml.automl: 11-20 04:10:27] {3166} INFO - iteration 110, current learner catboost
[flaml.automl: 11-20 04:10:29] {3343} INFO -  at 2886.3s,	estimator catboost's best error=0.0507,	best estimator catboost's best error=0.0507
[flaml.automl: 11-20 04:10:29] {3166} INFO - iteration 111, current learner catboost
[flaml.automl: 11-20 04:10:34] {3343} INFO -  at 2891.8s,	estimator catboost's best error=0.

[flaml.automl: 11-20 04:23:25] {3166} INFO - iteration 144, current learner catboost
[flaml.automl: 11-20 04:23:27] {3343} INFO -  at 3663.9s,	estimator catboost's best error=0.0507,	best estimator catboost's best error=0.0507
[flaml.automl: 11-20 04:23:27] {3166} INFO - iteration 145, current learner catboost
[flaml.automl: 11-20 04:23:28] {3343} INFO -  at 3665.4s,	estimator catboost's best error=0.0507,	best estimator catboost's best error=0.0507
[flaml.automl: 11-20 04:23:28] {3166} INFO - iteration 146, current learner catboost
[flaml.automl: 11-20 04:23:33] {3343} INFO -  at 3669.9s,	estimator catboost's best error=0.0507,	best estimator catboost's best error=0.0507
[flaml.automl: 11-20 04:23:33] {3166} INFO - iteration 147, current learner catboost
[flaml.automl: 11-20 04:23:57] {3343} INFO -  at 3694.9s,	estimator catboost's best error=0.0507,	best estimator catboost's best error=0.0507
[flaml.automl: 11-20 04:23:57] {3166} INFO - iteration 148, current learner catboost
[flaml.

[flaml.automl: 11-20 04:33:49] {3343} INFO -  at 4286.7s,	estimator catboost's best error=0.0507,	best estimator catboost's best error=0.0507
[flaml.automl: 11-20 04:33:49] {3166} INFO - iteration 181, current learner catboost
[flaml.automl: 11-20 04:34:14] {3343} INFO -  at 4311.8s,	estimator catboost's best error=0.0507,	best estimator catboost's best error=0.0507
[flaml.automl: 11-20 04:34:14] {3166} INFO - iteration 182, current learner catboost
[flaml.automl: 11-20 04:34:15] {3343} INFO -  at 4312.6s,	estimator catboost's best error=0.0507,	best estimator catboost's best error=0.0507
[flaml.automl: 11-20 04:34:15] {3166} INFO - iteration 183, current learner catboost
[flaml.automl: 11-20 04:34:16] {3343} INFO -  at 4313.4s,	estimator catboost's best error=0.0507,	best estimator catboost's best error=0.0507
[flaml.automl: 11-20 04:34:16] {3166} INFO - iteration 184, current learner catboost
[flaml.automl: 11-20 04:34:17] {3343} INFO -  at 4314.2s,	estimator catboost's best error=0.

[flaml.automl: 11-20 04:37:41] {3166} INFO - iteration 217, current learner catboost
[flaml.automl: 11-20 04:38:01] {3343} INFO -  at 4538.1s,	estimator catboost's best error=0.0507,	best estimator catboost's best error=0.0507
[flaml.automl: 11-20 04:38:01] {3166} INFO - iteration 218, current learner catboost
[flaml.automl: 11-20 04:38:26] {3343} INFO -  at 4563.7s,	estimator catboost's best error=0.0507,	best estimator catboost's best error=0.0507
[flaml.automl: 11-20 04:38:26] {3166} INFO - iteration 219, current learner catboost
[flaml.automl: 11-20 04:38:27] {3343} INFO -  at 4564.3s,	estimator catboost's best error=0.0507,	best estimator catboost's best error=0.0507
[flaml.automl: 11-20 04:38:27] {3166} INFO - iteration 220, current learner catboost
[flaml.automl: 11-20 04:38:27] {3343} INFO -  at 4564.8s,	estimator catboost's best error=0.0507,	best estimator catboost's best error=0.0507
[flaml.automl: 11-20 04:38:27] {3166} INFO - iteration 221, current learner catboost
[flaml.

In [203]:
from flaml import AutoML

# FLAML hyper-parameter search restricted to random forest.
# The AutoML instance is created in this cell (it was commented out before),
# so the cell no longer depends on an `automl_rf` left over in the kernel and
# works after Restart Kernel -> Run All.
automl_rf = AutoML(n_jobs=-1)

settings = {
    "time_budget": 1800,  # total running time in seconds
    "estimator_list": ['rf'],  # only random forest is tuned in this run
    "task": 'classification',  # task type
    "log_file_name": 'pochta.log',  # flaml log file
    "seed": seed,    # random seed
    'sample_weight': np.squeeze(sample_weight)  # per-row class weights
}

# X_train still carries the target, so it is split into features / label here.
automl_rf.fit(X_train=X_train.drop(columns='label'), y_train=X_train['label'], **settings)

[flaml.automl: 11-20 04:40:02] {2599} INFO - task = classification
[flaml.automl: 11-20 04:40:02] {2601} INFO - Data split method: stratified
[flaml.automl: 11-20 04:40:02] {2604} INFO - Evaluation method: holdout
[flaml.automl: 11-20 04:40:08] {2726} INFO - Minimizing error metric: 1-roc_auc
[flaml.automl: 11-20 04:40:08] {2870} INFO - List of ML learners in AutoML Run: ['rf']
[flaml.automl: 11-20 04:40:08] {3166} INFO - iteration 0, current learner rf
[flaml.automl: 11-20 04:40:09] {3296} INFO - Estimated sufficient time budget=1436063s. Estimated necessary time budget=1436s.
[flaml.automl: 11-20 04:40:09] {3343} INFO -  at 34.4s,	estimator rf's best error=0.1183,	best estimator rf's best error=0.1183
[flaml.automl: 11-20 04:40:09] {3166} INFO - iteration 1, current learner rf
[flaml.automl: 11-20 04:40:09] {3343} INFO -  at 34.8s,	estimator rf's best error=0.1183,	best estimator rf's best error=0.1183
[flaml.automl: 11-20 04:40:09] {3166} INFO - iteration 2, current learner rf
[flam

[flaml.automl: 11-20 04:41:22] {3343} INFO -  at 108.3s,	estimator rf's best error=0.0532,	best estimator rf's best error=0.0532
[flaml.automl: 11-20 04:41:22] {3166} INFO - iteration 38, current learner rf
[flaml.automl: 11-20 04:41:26] {3343} INFO -  at 112.1s,	estimator rf's best error=0.0532,	best estimator rf's best error=0.0532
[flaml.automl: 11-20 04:41:26] {3166} INFO - iteration 39, current learner rf
[flaml.automl: 11-20 04:41:37] {3343} INFO -  at 122.4s,	estimator rf's best error=0.0528,	best estimator rf's best error=0.0528
[flaml.automl: 11-20 04:41:37] {3166} INFO - iteration 40, current learner rf
[flaml.automl: 11-20 04:41:43] {3343} INFO -  at 128.8s,	estimator rf's best error=0.0528,	best estimator rf's best error=0.0528
[flaml.automl: 11-20 04:41:43] {3166} INFO - iteration 41, current learner rf
[flaml.automl: 11-20 04:41:52] {3343} INFO -  at 137.9s,	estimator rf's best error=0.0526,	best estimator rf's best error=0.0526
[flaml.automl: 11-20 04:41:52] {3166} INFO 

### Получаем предсказания на тестовой выборке для подсчета метрики качества. Также получаем ансамблевое предсказание — простое среднее предсказаний выбранных моделей (в коде — переменная `weighted`).

In [205]:
# Positive-class probabilities (column 1 of predict_proba) of every tuned model
# on the held-out split; the 'label' column is dropped to leave features only.
pr_lgbm, pr_xgb, pr_cb, pr_rf = (
    automl.predict_proba(X_test.drop(columns='label'))[:, 1]
    for automl in (automl_lgbm, automl_xgb, automl_cb, automl_rf)
)

In [209]:
# Models that take part in the blend; pr_cb (CatBoost) is left out.
sumlist = [pr_lgbm, pr_xgb, pr_rf]

# Despite the name, `weighted` is the plain element-wise mean of the listed
# probability vectors (every model gets the same weight).
weighted = np.mean(sumlist, axis=0)

### Посмотрим зависимость значения метрики качества на тестовой выборке от порога (threshold)

In [282]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, roc_auc_score

def compete_score(y_true, y_pred):
    """Return the blended score 0.1 * recall + 0.9 * ROC AUC.

    Both terms are computed from the same ``y_pred``; in this notebook the
    function is called with thresholded (boolean) predictions.

    Args:
        y_true: ground-truth binary labels.
        y_pred: predictions passed unchanged to ``recall_score`` and
            ``roc_auc_score``.

    Returns:
        The weighted sum of the two metrics.
    """
    recall = recall_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred)
    return 0.1 * recall + 0.9 * auc

# Sweep the decision threshold from 0.1 to 0.9 and print, for every value, the
# hold-out score of each model and of the blend, in the order
# [lgbm, xgb, catboost, rf, blend].
for thr in np.arange(0.1, 1., .1):
    score_lgbm, score_xgb, score_cb, score_rf, score_weighted = (
        compete_score(X_test['label'], proba > thr)
        for proba in (pr_lgbm, pr_xgb, pr_cb, pr_rf, weighted)
    )
    print(
        round(thr, 3),
        [round(score, 5) for score in (score_lgbm, score_xgb, score_cb, score_rf, score_weighted)],
    )

0.1 [0.85876, 0.85476, 0.83108, 0.85169, 0.85473]
0.2 [0.87884, 0.87827, 0.84597, 0.87844, 0.87854]
0.3 [0.88838, 0.88766, 0.87013, 0.8877, 0.88811]
0.4 [0.89235, 0.89258, 0.86663, 0.89185, 0.89278]
0.5 [0.8897, 0.89026, 0.87518, 0.89015, 0.89099]
0.6 [0.87811, 0.8815, 0.85212, 0.88263, 0.88201]
0.7 [0.84743, 0.84879, 0.83075, 0.84417, 0.84611]
0.8 [0.80153, 0.80107, 0.74222, 0.79782, 0.79905]
0.9 [0.72827, 0.72817, 0.45, 0.72867, 0.72624]


thr = .4 дает наивысшее значение метрики качества на тестовой выборке. Однако понижение значения thr до .3 дало лучший результат на публичной выборке.

### Подготавливаем submission.csv

In [214]:
# Positive-class probabilities of every model on the final (unlabelled) test set.
# NOTE(review): test_df is passed with all of its columns, while training dropped
# 'oper_type', 'oper_attr', 'index_reg' and 'index_loc' — confirm that FLAML's
# preprocessing keeps only the columns seen during fit.
ip_lgbm, ip_xgb, ip_cb, ip_rf = (
    automl.predict_proba(test_df)[:, 1]
    for automl in (automl_lgbm, automl_xgb, automl_cb, automl_rf)
)

In [288]:
# Number of rows predicted positive at the chosen threshold: per model, for the
# mean of all four models, and for the mean without CatBoost.
# thr = .27798
thr = .35
tuple(
    (proba > thr).sum()
    for proba in (
        ip_lgbm,
        ip_xgb,
        ip_cb,
        ip_rf,
        (ip_lgbm + ip_xgb + ip_cb + ip_rf)/4,
        (ip_lgbm+ip_xgb + ip_rf)/3,
    )
)

(854460, 785871, 1066113, 853019, 882086, 834242)

In [None]:
# Only the 'id' column is needed for the submission, so parse just that column
# instead of loading the whole test file.
# NOTE(review): assumes the rows are in the same order as in test_df — confirm.
x_infer_id = pd.read_csv('../data/test_dataset_test.csv', usecols=['id'])['id']

In [286]:
# Submission frame: ids plus hard labels obtained by thresholding the mean of
# the four models' probabilities at `thr`.
submission = pd.DataFrame({'id': x_infer_id}).assign(
    label=((ip_lgbm+ip_xgb + ip_cb+ip_rf)/4 > thr).astype(int)
)
# Number of rows labelled positive.
submission['label'].sum()

1303426

In [287]:
# Write the submission without the index; the threshold is embedded in the name.
# NOTE(review): the file name lists lgbm/xgb/rf, while the labels built earlier
# in this notebook average four models including CatBoost — confirm the name.
submission.to_csv(
    f'../solutions/last-hours-lgbm-cv-xgb-rf-{thr}.csv',
    index=False,
)