**15 подход (запускаем гридсерч на изначальном датасете + выбросы):**

- используем трейн
- выкидываем выбросы
- запускаем gridsearch для catboost c cv=3 и scoring='roc_auc'
- фильтруем тест через csv со списком признаков после фильтрации 
- запускаем predict_proba
- считаем метрики
- закидываем submission_version_15

Импортируем библиотеки

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
import warnings

warnings.simplefilter("ignore")

In [35]:
VERSION = 15

Считываем отфильтрованные данные

In [3]:
X = pd.read_parquet("../../data/input_data/train_ai_comp_final_dp.parquet")

In [4]:
print(X.shape)

(519615, 1079)


In [5]:
X.head()

Unnamed: 0,id,target,sample_ml_new,feature1,feature2,feature3,feature4,feature5,feature6,feature7,...,feature1067,feature1068,feature1069,feature1070,feature1071,feature1072,feature1073,feature1074,feature1075,feature1076
0,0,0,1,1761,1759,85,105469,191,46,0,...,28913,48985,84264,12045,12107,12510,21126,28913,48985,84264
1,1,0,1,1761,1759,120,105610,144,71,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,1,890,1759,141,105227,191,11,0,...,55949,70006,113317,33735,31242,12691,48682,55949,70006,113317
5,5,0,1,1599,966,30,102441,191,8,0,...,0,6872,8530,0,0,0,0,0,6872,8530
6,6,0,1,1761,1759,85,104006,191,2,0,...,0,0,7642,0,0,0,0,0,0,7642


In [6]:
X.target.value_counts()

target
0    501078
1     18537
Name: count, dtype: int64

In [7]:
y = X["target"]
X.drop(columns = ["target", "id"], inplace = True)

In [8]:
y.tail(3)

693039    0
693040    0
693042    1
Name: target, dtype: int64

In [9]:
def preprocessing_outliers(data: pd.DataFrame, g_m_int: int = 0.05, q_p_int: int = 0.95) -> pd.DataFrame:
    features = data.columns
    
    for feature in features:
        q_m = data[feature].quantile(g_m_int)
        q_p = data[feature].quantile(q_p_int)
        
        data.loc[data[feature] < q_m, feature] = q_m
        data.loc[data[feature] > q_p, feature] = q_p
        
    return data

In [10]:
X_wo_outliers = preprocessing_outliers(X)

Разделяем данные на обучающую и тестовую выборки

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_wo_outliers,
                                                    y, 
                                                    test_size=0.2, 
                                                    stratify=y, 
                                                    random_state=23)

In [47]:
catboost_parameters = {
    'depth'         : [5, 7, 9],
    'learning_rate' : [0.05, 0.1, 0.2],
    'iterations'    : [250, 500]
}

In [48]:
grid_search_ct = GridSearchCV(estimator=CatBoostClassifier(),
                              param_grid=catboost_parameters,
                              scoring='roc_auc',
                              cv=3,
                              n_jobs=-1,
                              verbose=2)

In [13]:
grid_search_ct.fit(X_train, y_train)

0:	learn: 0.6184197	total: 560ms	remaining: 4m 39s
1:	learn: 0.5551846	total: 935ms	remaining: 3m 52s
2:	learn: 0.5005520	total: 1.8s	remaining: 4m 58s
3:	learn: 0.4533947	total: 2.19s	remaining: 4m 32s
4:	learn: 0.4124895	total: 2.7s	remaining: 4m 27s
5:	learn: 0.3763198	total: 3.13s	remaining: 4m 17s
6:	learn: 0.3466225	total: 3.58s	remaining: 4m 11s
7:	learn: 0.3195789	total: 3.85s	remaining: 3m 57s
8:	learn: 0.2966100	total: 4.34s	remaining: 3m 56s
9:	learn: 0.2776433	total: 4.75s	remaining: 3m 52s
10:	learn: 0.2608805	total: 5.05s	remaining: 3m 44s
11:	learn: 0.2460889	total: 5.29s	remaining: 3m 35s
12:	learn: 0.2334621	total: 5.6s	remaining: 3m 29s
13:	learn: 0.2227066	total: 5.86s	remaining: 3m 23s
14:	learn: 0.2137470	total: 6.17s	remaining: 3m 19s
15:	learn: 0.2057476	total: 6.47s	remaining: 3m 15s
16:	learn: 0.1984566	total: 6.76s	remaining: 3m 12s
17:	learn: 0.1919752	total: 7.04s	remaining: 3m 8s
18:	learn: 0.1865237	total: 7.39s	remaining: 3m 7s
19:	learn: 0.1817303	total:

<catboost.core.CatBoostClassifier at 0x139643700>

In [50]:
print("the best estimator:\n", grid_search_ct.best_estimator_)
print("the best score:\n", grid_search_ct.best_score_)
print("the best parameters:\n", grid_search_ct.best_params_)

the best estimator:
 <catboost.core.CatBoostClassifier object at 0x7f1e03ed7c90>
the best score:
 0.7112985128749217
the best parameters:
 {'depth': 5, 'iterations': 500, 'learning_rate': 0.05}


In [15]:
pred = grid_search_ct.predict_proba(X_test)
pred = pred[:, 1]

In [16]:
for threshold in [0.1, 0.3, 0.5]:
    pred_binary = (pred >= threshold)
    
    print("\tthreshold:", threshold)
    print("\tF1_SCORE:", f1_score(y_test, pred_binary))
    print("\tPRECISION:", precision_score(y_test, pred_binary))
    print("\tRECALL:", recall_score(y_test, pred_binary))
    print("\tROC_AUC:", roc_auc_score(y_test, pred))
    print()

	threshold: 0.1
	F1_SCORE: 0.1977843276755768
	PRECISION: 0.158675799086758
	RECALL: 0.2624763960075533
	ROC_AUC: 0.75107262243955

	threshold: 0.3
	F1_SCORE: 0.047885888945491596
	PRECISION: 0.4292237442922374
	RECALL: 0.025357431885621798
	ROC_AUC: 0.75107262243955

	threshold: 0.5
	F1_SCORE: 0.00964630225080386
	PRECISION: 0.72
	RECALL: 0.0048556784461828975
	ROC_AUC: 0.75107262243955



In [17]:
best_threshold = 0.1

Сохраняем submission

In [24]:
X_test_submit = pd.read_parquet("../../data/input_data/test_sber.parquet")

In [25]:
print(X_test_submit.shape)

(173433, 1078)


In [26]:
X_test_submit.head()

Unnamed: 0,id,sample_ml_new,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,...,feature1067,feature1068,feature1069,feature1070,feature1071,feature1072,feature1073,feature1074,feature1075,feature1076
0,3,3,1696,458,26,102479,22,16,0,121,...,779,7740,9577,254,0,355,308,779,7740,9577
1,4,3,1688,53,78,103922,191,64,0,0,...,79401,109240,153820,24766,48600,46029,65113,79401,109240,153820
2,12,3,1689,13,81,104111,191,4,0,0,...,0,0,0,0,0,0,0,0,0,0
3,16,3,1761,1759,44,102433,191,4,0,0,...,0,0,0,0,0,0,0,0,0,0
4,20,3,1761,1759,77,102010,191,34,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
X_filtered_wo_outliers = preprocessing_outliers(X_test_submit)

In [28]:
pred = grid_search_ct.predict_proba(X_filtered_wo_outliers)

pred = pred[:, 1]
pred_binary = (pred >= best_threshold).astype(int)

In [31]:
submission = pd.read_csv("../../data/input_data/sample_submission.csv")
submission.head(10)

Unnamed: 0,id,target_bin,target_prob
0,3,0,0.03
1,4,0,0.03
2,12,1,0.03
3,16,1,0.03
4,20,0,0.03
5,23,0,0.03
6,26,0,0.03
7,50,0,0.03
8,51,1,0.03
9,53,0,0.03


In [32]:
submission["target_prob"] = pred
submission["target_bin"] = pred_binary
submission.head(10)

Unnamed: 0,id,target_bin,target_prob
0,3,0,0.016242
1,4,0,0.023133
2,12,0,0.036379
3,16,0,0.013805
4,20,0,0.038246
5,23,0,0.006443
6,26,0,0.016756
7,50,0,0.020308
8,51,0,0.054205
9,53,0,0.016397


In [33]:
submission.target_bin.value_counts()

target_bin
0    161537
1     11896
Name: count, dtype: int64

In [36]:
submission.to_csv(f"../../data/output_data/submission_version_{VERSION}_{best_threshold}.csv", index=False)

---