**Восьмой подход (запускаем гридсерч на отфильтрованном датасете):**

- используем отфильтрованный трейн
- выкидываем выбросы
- запускаем gridsearch для catboost c cv=3 и scoring='roc_auc'
- фильтруем тест через csv со списком признаков после фильтрации
- запускаем predict_proba
- считаем метрики
- закидываем submission_version_8

Импортируем библиотеки

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
import warnings

warnings.simplefilter("ignore")

In [2]:
VERSION = 8

Считываем отфильтрованные данные

In [3]:
X = pd.read_parquet("../../data/intermediate_data/filter_train.parquet")

In [4]:
print(X.shape)

(519615, 228)


In [5]:
X.head()

Unnamed: 0,feature547,feature418,feature867,feature15,feature641,feature734,feature549,feature674,feature442,feature763,...,feature717,feature844,feature380,feature550,feature864,feature727,feature454,feature369,feature367,feature409
0,37,77,7063,6,0,14,17,1,35,0,...,4,1283,0,49,1619,28,0,1,4,56
1,1,77,7063,135,0,14,17,0,35,153,...,4,1283,0,49,2092,28,6,1,4,56
2,37,77,7063,0,0,14,17,0,35,153,...,4,1283,0,49,7174,28,83,1,3,56
3,37,77,7063,0,0,14,17,0,35,1,...,4,1283,0,49,7174,28,0,1,2,56
4,37,77,7063,0,0,14,17,0,35,153,...,4,1283,0,49,1439,28,1,2,2,56


In [6]:
X.target.value_counts()

target
0    501078
1     18537
Name: count, dtype: int64

In [7]:
y = X["target"]
X.drop(columns = ["target", "id"], inplace = True)

In [8]:
y.tail(3)

519612    0
519613    0
519614    1
Name: target, dtype: int64

In [9]:
X.columns

Index(['feature547', 'feature418', 'feature867', 'feature15', 'feature641',
       'feature734', 'feature549', 'feature674', 'feature442', 'feature763',
       ...
       'feature717', 'feature844', 'feature380', 'feature550', 'feature864',
       'feature727', 'feature454', 'feature369', 'feature367', 'feature409'],
      dtype='object', length=226)

Разделяем данные на обучающую и тестовую выборки

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y, 
                                                    test_size=0.2, 
                                                    stratify=y, 
                                                    random_state=23)

In [11]:
catboost_parameters = {
    'depth'         : [5, 7, 9],
    'learning_rate' : [0.05, 0.1, 0.2],
    'iterations'    : [250, 500]
}

In [12]:
grid_search_ct = GridSearchCV(estimator=CatBoostClassifier(),
                              param_grid=catboost_parameters,
                              scoring='roc_auc',
                              cv=3,
                              n_jobs=5,
                              verbose=2)

In [13]:
grid_search_ct.fit(X_train, y_train)

Fitting 3 folds for each of 18 candidates, totalling 54 fits
0:	learn: 0.6188409	total: 188ms	remaining: 46.7s
1:	learn: 0.5551718	total: 241ms	remaining: 29.9s
2:	learn: 0.5002224	total: 303ms	remaining: 25s
3:	learn: 0.4530168	total: 380ms	remaining: 23.4s
4:	learn: 0.4127111	total: 493ms	remaining: 24.2s
5:	learn: 0.3771278	total: 567ms	remaining: 23.1s
6:	learn: 0.3472244	total: 669ms	remaining: 23.2s
7:	learn: 0.3210178	total: 814ms	remaining: 24.6s
8:	learn: 0.2987744	total: 875ms	remaining: 23.4s
9:	learn: 0.2796914	total: 1.01s	remaining: 24.2s
10:	learn: 0.2634303	total: 1.12s	remaining: 24.4s
11:	learn: 0.2493089	total: 1.18s	remaining: 23.4s
12:	learn: 0.2371590	total: 1.24s	remaining: 22.6s
13:	learn: 0.2256251	total: 1.31s	remaining: 22.2s
14:	learn: 0.2164645	total: 1.42s	remaining: 22.3s
15:	learn: 0.2082145	total: 1.58s	remaining: 23.1s
16:	learn: 0.2007908	total: 1.67s	remaining: 22.9s
17:	learn: 0.1944733	total: 1.81s	remaining: 23.4s
18:	learn: 0.1893219	total: 1.92s

In [14]:
print("the best estimator:\n", grid_search_ct.best_estimator_)
print("the best score:\n", grid_search_ct.best_score_)
print("the best parameters:\n", grid_search_ct.best_params_)

the best estimator:
 <catboost.core.CatBoostClassifier object at 0x7fa77776b550>
the best score:
 0.71081324266083
the best parameters:
 {'depth': 5, 'iterations': 500, 'learning_rate': 0.05}


In [15]:
pred = grid_search_ct.predict_proba(X_test)
pred = pred[:, 1]

In [16]:
for threshold in [0.1, 0.3, 0.5]:
    pred_binary = (pred >= threshold)
    
    print("\tthreshold:", threshold)
    print("\tF1_SCORE:", f1_score(y_test, pred_binary))
    print("\tPRECISION:", precision_score(y_test, pred_binary))
    print("\tRECALL:", recall_score(y_test, pred_binary))
    print("\tROC_AUC:", roc_auc_score(y_test, pred))
    print()

	threshold: 0.1
	F1_SCORE: 0.15724991078862852
	PRECISION: 0.14063829787234042
	RECALL: 0.17831130294038305
	ROC_AUC: 0.708316055663441

	threshold: 0.3
	F1_SCORE: 0.015380535666931849
	PRECISION: 0.453125
	RECALL: 0.007823037496628
	ROC_AUC: 0.708316055663441

	threshold: 0.5
	F1_SCORE: 0.0
	PRECISION: 0.0
	RECALL: 0.0
	ROC_AUC: 0.708316055663441



In [17]:
best_threshold = 0.1

Сохраняем submission

In [18]:
X_test_submit = pd.read_parquet("../../data/input_data/test_sber.parquet")

In [19]:
print(X_test_submit.shape)

(173433, 1078)


In [20]:
X_test_submit.head()

Unnamed: 0,id,sample_ml_new,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,...,feature1067,feature1068,feature1069,feature1070,feature1071,feature1072,feature1073,feature1074,feature1075,feature1076
0,3,3,1696,458,26,102479,22,16,0,121,...,779,7740,9577,254,0,355,308,779,7740,9577
1,4,3,1688,53,78,103922,191,64,0,0,...,79401,109240,153820,24766,48600,46029,65113,79401,109240,153820
2,12,3,1689,13,81,104111,191,4,0,0,...,0,0,0,0,0,0,0,0,0,0
3,16,3,1761,1759,44,102433,191,4,0,0,...,0,0,0,0,0,0,0,0,0,0
4,20,3,1761,1759,77,102010,191,34,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
filtered_columns = pd.read_csv("../../data/intermediate_data/cols_after_preprocessing.csv")

In [22]:
cols = list(set(filtered_columns.cols_after_preprocessing.values) & set(X_test_submit.columns))

In [23]:
X_filtered = X_test_submit[cols]

In [24]:
X_filtered.drop(columns=["id"], inplace=True)

In [25]:
print(X_filtered.shape)

(173433, 226)


In [26]:
X_filtered.head()

Unnamed: 0,feature338,feature462,feature441,feature90,feature770,feature420,feature857,feature498,feature446,feature812,...,feature553,feature515,feature103,feature513,feature378,feature836,feature717,feature831,feature495,feature17
0,1574,54,2,78982,5,54,23,17,6,1664,...,16,3,213300,9,23,1289,4,1306,0,1
1,2046,0,113,0,5,54,1605,17,2,1656,...,10,3,322921,8,7,1289,4,0,0,1
2,2046,54,0,0,5,54,1605,17,15,1657,...,20,4,565716,12,39,1289,4,1306,3,0
3,2046,0,113,0,5,54,1605,17,14,1729,...,4,448,340738,0,24,1289,4,1306,1,135
4,2046,54,5,165865,5,54,1605,17,100,1729,...,6,2,328274,12,26,1289,4,1306,1,72


In [28]:
pred = grid_search_ct.predict_proba(X_filtered)

pred = pred[:, 1]
pred_binary = (pred >= best_threshold).astype(int)

In [29]:
submission = pd.read_csv("../../data/intermediate_data/sample_submission.csv")
submission.head(10)

Unnamed: 0,id,target_bin,target_prob
0,3,0,0.03
1,4,0,0.03
2,12,1,0.03
3,16,1,0.03
4,20,0,0.03
5,23,0,0.03
6,26,0,0.03
7,50,0,0.03
8,51,1,0.03
9,53,0,0.03


In [30]:
submission["target_prob"] = pred
submission["target_bin"] = pred_binary
submission.head(10)

Unnamed: 0,id,target_bin,target_prob
0,3,0,0.033415
1,4,0,0.01637
2,12,0,0.018259
3,16,0,0.019318
4,20,0,0.058683
5,23,0,0.014014
6,26,0,0.023644
7,50,0,0.035147
8,51,0,0.062097
9,53,0,0.019231


In [31]:
submission.target_bin.value_counts()

target_bin
0    162759
1     10674
Name: count, dtype: int64

In [None]:
submission.to_csv(f"../../data/output_data/submission_version_{VERSION}_{best_threshold}.csv", index=False)

---