**Седьмой подход (запускаем гридсерч на отфильтрованном датасете + выбросы):**

- используем отфильтрованный трейн
- выкидываем выбросы
- запускаем gridsearch для catboost c cv=3 и scoring='roc_auc'
- фильтруем тест через csv со списком признаков после фильтрации 
- запускаем predict_proba
- считаем метрики
- закидываем submission_version_7

Импортируем библиотеки

In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
import warnings

warnings.simplefilter("ignore")

In [36]:
VERSION = 7

Считываем отфильтрованные данные

In [37]:
X = pd.read_parquet("../../data/intermediate_data/filter_train.parquet")

In [38]:
print(X.shape)

(519615, 228)


In [39]:
X.head()

Unnamed: 0,feature547,feature418,feature867,feature15,feature641,feature734,feature549,feature674,feature442,feature763,...,feature717,feature844,feature380,feature550,feature864,feature727,feature454,feature369,feature367,feature409
0,37,77,7063,6,0,14,17,1,35,0,...,4,1283,0,49,1619,28,0,1,4,56
1,1,77,7063,135,0,14,17,0,35,153,...,4,1283,0,49,2092,28,6,1,4,56
2,37,77,7063,0,0,14,17,0,35,153,...,4,1283,0,49,7174,28,83,1,3,56
3,37,77,7063,0,0,14,17,0,35,1,...,4,1283,0,49,7174,28,0,1,2,56
4,37,77,7063,0,0,14,17,0,35,153,...,4,1283,0,49,1439,28,1,2,2,56


In [40]:
X.target.value_counts()

target
0    501078
1     18537
Name: count, dtype: int64

In [41]:
y = X["target"]
X.drop(columns = ["target", "id"], inplace = True)

In [42]:
y.tail(3)

519612    0
519613    0
519614    1
Name: target, dtype: int64

In [44]:
def preprocessing_outliers(data: pd.DataFrame, g_m_int: int = 0.05, q_p_int: int = 0.95) -> pd.DataFrame:
    features = data.columns
    
    for feature in features:
        q_m = data[feature].quantile(g_m_int)
        q_p = data[feature].quantile(q_p_int)
        
        data.loc[data[feature] < q_m, feature] = q_m
        data.loc[data[feature] > q_p, feature] = q_p
        
    return data

In [45]:
X_wo_outliers = preprocessing_outliers(X)

Разделяем данные на обучающую и тестовую выборки

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X_wo_outliers,
                                                    y, 
                                                    test_size=0.2, 
                                                    stratify=y, 
                                                    random_state=23)

In [47]:
catboost_parameters = {
    'depth'         : [5, 7, 9],
    'learning_rate' : [0.05, 0.1, 0.2],
    'iterations'    : [250, 500]
}

In [48]:
grid_search_ct = GridSearchCV(estimator=CatBoostClassifier(),
                              param_grid=catboost_parameters,
                              scoring='roc_auc',
                              cv=3,
                              n_jobs=-1,
                              verbose=2)

In [49]:
grid_search_ct.fit(X_train, y_train)

Fitting 3 folds for each of 18 candidates, totalling 54 fits
0:	learn: 0.5515880	total: 2.44s	remaining: 20m 15s
1:	learn: 0.4470928	total: 4.37s	remaining: 18m 7s
2:	learn: 0.3714321	total: 7.14s	remaining: 19m 43s
3:	learn: 0.3157114	total: 9.3s	remaining: 19m 13s
4:	learn: 0.2747326	total: 11.9s	remaining: 19m 38s
5:	learn: 0.2443620	total: 13.9s	remaining: 19m
6:	learn: 0.2218791	total: 16.2s	remaining: 19m 2s
7:	learn: 0.2044886	total: 17.3s	remaining: 17m 42s
8:	learn: 0.1919425	total: 18.5s	remaining: 16m 49s
9:	learn: 0.1818183	total: 20.9s	remaining: 17m 6s
10:	learn: 0.1740936	total: 24.6s	remaining: 18m 12s
11:	learn: 0.1684043	total: 27.6s	remaining: 18m 43s
12:	learn: 0.1634409	total: 29.4s	remaining: 18m 21s
13:	learn: 0.1600649	total: 31.8s	remaining: 18m 23s
14:	learn: 0.1572586	total: 36s	remaining: 19m 24s
15:	learn: 0.1549998	total: 38.9s	remaining: 19m 37s
16:	learn: 0.1531546	total: 42.5s	remaining: 20m 6s
17:	learn: 0.1517902	total: 46.7s	remaining: 20m 49s
18:	le

In [50]:
print("the best estimator:\n", grid_search_ct.best_estimator_)
print("the best score:\n", grid_search_ct.best_score_)
print("the best parameters:\n", grid_search_ct.best_params_)

the best estimator:
 <catboost.core.CatBoostClassifier object at 0x7f1e03ed7c90>
the best score:
 0.7112985128749217
the best parameters:
 {'depth': 5, 'iterations': 500, 'learning_rate': 0.05}


In [51]:
pred = grid_search_ct.predict_proba(X_test)
pred = pred[:, 1]

In [52]:
for threshold in [0.1, 0.3, 0.5]:
    pred_binary = (pred >= threshold)
    
    print("\tthreshold:", threshold)
    print("\tF1_SCORE:", f1_score(y_test, pred_binary))
    print("\tPRECISION:", precision_score(y_test, pred_binary))
    print("\tRECALL:", recall_score(y_test, pred_binary))
    print("\tROC_AUC:", roc_auc_score(y_test, pred))
    print()

	threshold: 0.1
	F1_SCORE: 0.15344603381014305
	PRECISION: 0.13657407407407407
	RECALL: 0.17507418397626112
	ROC_AUC: 0.7066721826363553

	threshold: 0.3
	F1_SCORE: 0.009083622762489982
152:	learn: 0.1275682	total: 11m 43s	remaining: 7m 26s
153:	learn: 0.1274683	total: 11m 47s	remaining: 7m 20s
154:	learn: 0.1273708	total: 11m 51s	remaining: 7m 16s
155:	learn: 0.1273458	total: 11m 53s	remaining: 7m 10s
156:	learn: 0.1272447	total: 11m 57s	remaining: 7m 4s
157:	learn: 0.1271210	total: 12m 3s	remaining: 7m 1s
158:	learn: 0.1270325	total: 12m 7s	remaining: 6m 56s
159:	learn: 0.1269356	total: 12m 13s	remaining: 6m 52s
160:	learn: 0.1268346	total: 12m 18s	remaining: 6m 48s
161:	learn: 0.1267386	total: 12m 25s	remaining: 6m 44s
162:	learn: 0.1266195	total: 12m 30s	remaining: 6m 40s
163:	learn: 0.1265167	total: 12m 34s	remaining: 6m 35s
164:	learn: 0.1264117	total: 12m 37s	remaining: 6m 30s
165:	learn: 0.1263052	total: 12m 40s	remaining: 6m 25s
166:	learn: 0.1262023	total: 12m 46s	remaining: 

In [53]:
best_threshold = 0.1

Сохраняем submission

In [54]:
X_test_submit = pd.read_parquet("../../data/input_data/test_sber.parquet")

In [55]:
print(X_test_submit.shape)

(173433, 1078)


In [56]:
X_test_submit.head()

Unnamed: 0,id,sample_ml_new,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,...,feature1067,feature1068,feature1069,feature1070,feature1071,feature1072,feature1073,feature1074,feature1075,feature1076
0,3,3,1696,458,26,102479,22,16,0,121,...,779,7740,9577,254,0,355,308,779,7740,9577
1,4,3,1688,53,78,103922,191,64,0,0,...,79401,109240,153820,24766,48600,46029,65113,79401,109240,153820
2,12,3,1689,13,81,104111,191,4,0,0,...,0,0,0,0,0,0,0,0,0,0
3,16,3,1761,1759,44,102433,191,4,0,0,...,0,0,0,0,0,0,0,0,0,0
4,20,3,1761,1759,77,102010,191,34,0,0,...,0,0,0,0,0,0,0,0,0,0


In [57]:
filtered_columns = pd.read_csv("../../data/intermediate_data/cols_after_preprocessing.csv")

In [58]:
cols = list(set(filtered_columns.cols_after_preprocessing.values) & set(X_test_submit.columns))

In [59]:
X_filtered = X_test_submit[cols]

In [60]:
X_filtered.drop(columns=["id"], inplace=True)

In [61]:
print(X_filtered.shape)

(173433, 226)


In [62]:
X_filtered.head()

Unnamed: 0,feature500,feature458,feature752,feature423,feature681,feature641,feature502,feature776,feature488,feature416,...,feature457,feature17,feature337,feature440,feature429,feature766,feature514,feature764,feature782,feature495
0,147,119,17,17,1,0,1,92,58,34,...,129,1,1332,217,2,14,87,57,42,0
1,1,2,17,17,0,0,103,92,75,34,...,129,1,1332,0,1,14,1,57,42,0
2,0,119,17,17,0,0,1,92,142,34,...,129,0,1332,2,2,14,87,0,0,3
3,0,119,17,17,0,0,103,92,115,34,...,129,135,1332,217,2,14,87,57,42,1
4,147,119,17,17,0,0,2,92,165,34,...,129,72,1332,217,1,14,87,57,42,1


In [63]:
X_filtered_wo_outliers = preprocessing_outliers(X_filtered)

In [64]:
pred = grid_search_ct.predict_proba(X_filtered_wo_outliers)

pred = pred[:, 1]
pred_binary = (pred >= best_threshold).astype(int)

In [65]:
submission = pd.read_csv("../../data/intermediate_data/sample_submission.csv")
submission.head(10)

Unnamed: 0,id,target_bin,target_prob
0,3,0,0.03
1,4,0,0.03
2,12,1,0.03
3,16,1,0.03
4,20,0,0.03
5,23,0,0.03
6,26,0,0.03
7,50,0,0.03
8,51,1,0.03
9,53,0,0.03


In [66]:
submission["target_prob"] = pred
submission["target_bin"] = pred_binary
submission.head(10)

Unnamed: 0,id,target_bin,target_prob
0,3,0,0.039561
1,4,0,0.015499
2,12,0,0.016449
3,16,0,0.016202
4,20,0,0.056869
5,23,0,0.012634
6,26,0,0.02212
7,50,0,0.034857
8,51,0,0.041225
9,53,0,0.022707


In [67]:
submission.target_bin.value_counts()

target_bin
0    162836
1     10597
Name: count, dtype: int64

In [68]:
submission.to_csv(f"../../data/output_data/submission_version_{VERSION}_{best_threshold}.csv", index=False)

---