**Третий подход (добавляем работу с выбросами):**

- используем отфильтрованный трейн
- делаем oversampling при помощи SMOTE
- запускаем gridsearch для catboost c cv=3 и scoring='roc_auc'
- фильтруем тест через csv со списком признаков после фильтрации
- запускаем predict_proba
- считаем метрики
- закидываем submission_version_3

Импортируем библиотеки

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import ADASYN
import warnings

warnings.simplefilter("ignore")

In [2]:
VERSION = 3

Считываем отфильтрованные данные

In [3]:
X = pd.read_parquet("../../data/intermediate_data/filter_train.parquet")

In [4]:
print(X.shape)

(519615, 228)


In [5]:
X.head()

Unnamed: 0,feature547,feature418,feature867,feature15,feature641,feature734,feature549,feature674,feature442,feature763,...,feature717,feature844,feature380,feature550,feature864,feature727,feature454,feature369,feature367,feature409
0,37,77,7063,6,0,14,17,1,35,0,...,4,1283,0,49,1619,28,0,1,4,56
1,1,77,7063,135,0,14,17,0,35,153,...,4,1283,0,49,2092,28,6,1,4,56
2,37,77,7063,0,0,14,17,0,35,153,...,4,1283,0,49,7174,28,83,1,3,56
3,37,77,7063,0,0,14,17,0,35,1,...,4,1283,0,49,7174,28,0,1,2,56
4,37,77,7063,0,0,14,17,0,35,153,...,4,1283,0,49,1439,28,1,2,2,56


In [6]:
X.target.value_counts()

target
0    501078
1     18537
Name: count, dtype: int64

In [7]:
y = X["target"]
X.drop(columns = ["target", "id"], inplace = True)

In [8]:
y.tail(3)

519612    0
519613    0
519614    1
Name: target, dtype: int64

In [10]:
def preprocessing_outliers(data: pd.DataFrame, g_m_int: int = 0.05, q_p_int: int = 0.95) -> pd.DataFrame:
    features = data.columns
    
    for feature in features:
        q_m = data[feature].quantile(g_m_int)
        q_p = data[feature].quantile(q_p_int)
        
        data.loc[data[feature] < q_m, feature] = q_m
        data.loc[data[feature] > q_p, feature] = q_p
        
    return data

In [11]:
X_wo_outliers = preprocessing_outliers(X)

Делаем oversampling через ADASYN

In [12]:
X_resampled, y_resampled = ADASYN().fit_resample(X_wo_outliers, y)

In [13]:
X_resampled.shape

(994992, 226)

Создаем сетку параметров для CatBoost и запускаем перебор параметров с CV

In [16]:
catboost_parameters = {
    'depth'         : [4, 7, 10],
    'learning_rate' : [0.01, 0.03, 0.05],
    'iterations'    : [50, 250]
}

Разделяем данные на обучающую и тестовую выборки

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled,
                                                    y_resampled, 
                                                    test_size=0.2, 
                                                    stratify=y_resampled, 
                                                    random_state=23)

In [18]:
grid_search_ct = GridSearchCV(estimator=CatBoostClassifier(),
                              param_grid=catboost_parameters,
                              scoring='roc_auc',
                              cv=3,
                              n_jobs=5,
                              verbose=2)

In [19]:
grid_search_ct.fit(X_train, y_train)

Fitting 3 folds for each of 18 candidates, totalling 54 fits
0:	learn: 0.6872885	total: 126ms	remaining: 6.19s
1:	learn: 0.6826459	total: 224ms	remaining: 5.38s
2:	learn: 0.6770568	total: 383ms	remaining: 6s
3:	learn: 0.6716582	total: 438ms	remaining: 5.04s
4:	learn: 0.6664168	total: 493ms	remaining: 4.44s
5:	learn: 0.6597884	total: 537ms	remaining: 3.94s
6:	learn: 0.6548256	total: 627ms	remaining: 3.85s
7:	learn: 0.6508643	total: 746ms	remaining: 3.92s
8:	learn: 0.6470401	total: 825ms	remaining: 3.76s
9:	learn: 0.6424152	total: 892ms	remaining: 3.57s
10:	learn: 0.6363677	total: 1.01s	remaining: 3.59s
11:	learn: 0.6328018	total: 1.09s	remaining: 3.47s
12:	learn: 0.6292065	total: 1.2s	remaining: 3.41s
13:	learn: 0.6257577	total: 1.34s	remaining: 3.46s
14:	learn: 0.6225415	total: 1.49s	remaining: 3.48s
15:	learn: 0.6184638	total: 1.62s	remaining: 3.45s
16:	learn: 0.6145262	total: 1.77s	remaining: 3.43s
17:	learn: 0.6113670	total: 1.86s	remaining: 3.31s
18:	learn: 0.6051017	total: 2.01s	r

In [20]:
print("the best estimator:\n", grid_search_ct.best_estimator_)
print("the best score:\n", grid_search_ct.best_score_)
print("the best parameters:\n", grid_search_ct.best_params_)

the best estimator:
 <catboost.core.CatBoostClassifier object at 0x7f967480d210>
the best score:
 0.988209569572483
the best parameters:
 {'depth': 10, 'iterations': 250, 'learning_rate': 0.05}


In [14]:
grid_search_ct = CatBoostClassifier(depth=10, iterations=250, learning_rate=0.5)

In [17]:
grid_search_ct.fit(X_train, y_train)

0:	learn: 0.3874527	total: 174ms	remaining: 43.2s
1:	learn: 0.2618933	total: 295ms	remaining: 36.6s
2:	learn: 0.1881824	total: 412ms	remaining: 34s
3:	learn: 0.1565982	total: 532ms	remaining: 32.7s
4:	learn: 0.1351437	total: 648ms	remaining: 31.7s
5:	learn: 0.1236702	total: 768ms	remaining: 31.2s
6:	learn: 0.1161152	total: 889ms	remaining: 30.9s
7:	learn: 0.1106142	total: 1s	remaining: 30.4s
8:	learn: 0.1034607	total: 1.13s	remaining: 30.2s
9:	learn: 0.0988536	total: 1.25s	remaining: 30.1s
10:	learn: 0.0955264	total: 1.39s	remaining: 30.2s
11:	learn: 0.0925787	total: 1.52s	remaining: 30.2s
12:	learn: 0.0913333	total: 1.64s	remaining: 30s
13:	learn: 0.0900946	total: 1.77s	remaining: 29.8s
14:	learn: 0.0880614	total: 1.88s	remaining: 29.5s
15:	learn: 0.0870740	total: 2.01s	remaining: 29.4s
16:	learn: 0.0863077	total: 2.13s	remaining: 29.3s
17:	learn: 0.0854537	total: 2.25s	remaining: 29.1s
18:	learn: 0.0847856	total: 2.38s	remaining: 28.9s
19:	learn: 0.0842377	total: 2.5s	remaining: 28.7

<catboost.core.CatBoostClassifier at 0x7f634002b710>

In [18]:
pred = grid_search_ct.predict_proba(X_test)
pred = pred[:, 1]

In [19]:
for threshold in [0.1, 0.3, 0.5]:
    pred_binary = (pred >= threshold)
    
    print("threshold:", threshold)
    print("F1_SCORE:", f1_score(y_test, pred_binary))
    print("PRECISION:", precision_score(y_test, pred_binary))
    print("RECALL:", recall_score(y_test, pred_binary))
    print("ROC_AUC:", roc_auc_score(y_test, pred))
    print()

threshold: 0.1
F1_SCORE: 0.9539681352382378
PRECISION: 0.9400503184212595
RECALL: 0.9683042628792403
ROC_AUC: 0.9855298281999816

threshold: 0.3
F1_SCORE: 0.9759620312852368
PRECISION: 0.9884142226836231
RECALL: 0.9638196855734287
ROC_AUC: 0.9855298281999816

threshold: 0.5
F1_SCORE: 0.9791947760348539
PRECISION: 0.9965618088239918
RECALL: 0.9624226840650719
ROC_AUC: 0.9855298281999816



In [36]:
best_threshold = 0.1

Сохраняем submission

In [37]:
X_test_submit = pd.read_parquet("../../data/input_data/test_sber.parquet")
#X.drop(columns = ["sample_ml_new", "id"], inplace = True)

In [38]:
print(X_test_submit.shape)

(173433, 1078)


In [39]:
X_test_submit.head()

Unnamed: 0,id,sample_ml_new,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,...,feature1067,feature1068,feature1069,feature1070,feature1071,feature1072,feature1073,feature1074,feature1075,feature1076
0,3,3,1696,458,26,102479,22,16,0,121,...,779,7740,9577,254,0,355,308,779,7740,9577
1,4,3,1688,53,78,103922,191,64,0,0,...,79401,109240,153820,24766,48600,46029,65113,79401,109240,153820
2,12,3,1689,13,81,104111,191,4,0,0,...,0,0,0,0,0,0,0,0,0,0
3,16,3,1761,1759,44,102433,191,4,0,0,...,0,0,0,0,0,0,0,0,0,0
4,20,3,1761,1759,77,102010,191,34,0,0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
filtered_columns = pd.read_csv("../../data/intermediate_data/cols_after_preprocessing.csv")

In [41]:
cols = list(set(filtered_columns.cols_after_preprocessing.values) & set(X_test_submit.columns))

In [42]:
X_filtered = X_test_submit[cols]

In [43]:
X_filtered.drop(columns=["id"], inplace=True)

In [44]:
print(X_filtered.shape)

(173433, 226)


In [45]:
X_filtered.head()

Unnamed: 0,feature751,feature576,feature553,feature725,feature528,feature556,feature498,feature824,feature987,feature457,...,feature504,feature446,feature777,feature497,feature440,feature198,feature844,feature145,feature188,feature757
0,3,8,16,14,0,13,17,1278,0,129,...,3,6,0,14,217,28680,1283,21059,92684,0
1,5,0,10,14,5,13,17,1278,0,129,...,20,2,0,5,0,202554,1283,51376,303793,0
2,24,8,20,0,3,13,17,1278,0,129,...,0,15,2,26,2,342741,139,212692,400876,1
3,17,8,4,14,237,13,17,1278,0,129,...,30,14,1,20,217,402837,1283,46471,146653,0
4,11,8,6,14,0,13,17,1278,0,129,...,10,100,0,11,217,88432,1283,35981,139526,0


In [46]:
X_filtered_wo_outliers = preprocessing_outliers(X_filtered)

In [47]:
pred = grid_search_ct.predict_proba(X_filtered_wo_outliers)

pred = pred[:, 1]
pred_binary = (pred >= best_threshold).astype(int)

In [48]:
submission = pd.read_csv("../../data/intermediate_data/sample_submission.csv")
submission.head(10)

Unnamed: 0,id,target_bin,target_prob
0,3,0,0.03
1,4,0,0.03
2,12,1,0.03
3,16,1,0.03
4,20,0,0.03
5,23,0,0.03
6,26,0,0.03
7,50,0,0.03
8,51,1,0.03
9,53,0,0.03


In [49]:
submission["target_prob"] = pred
submission["target_bin"] = pred_binary
submission.head(10)

Unnamed: 0,id,target_bin,target_prob
0,3,0,0.020471
1,4,0,0.044858
2,12,0,0.013405
3,16,0,0.006872
4,20,0,0.003861
5,23,0,0.008023
6,26,0,0.016584
7,50,1,0.279655
8,51,0,0.067735
9,53,0,0.022439


In [50]:
submission.target_bin.value_counts()

target_bin
0    158015
1     15418
Name: count, dtype: int64

In [51]:
submission.to_csv(f"../../data/output_data/submission_version_{VERSION}_{best_threshold}.csv", index=False)

---