**11 подход (запускаем гридсерч на отфильтрованном датасете + катег + выбросы):**

- используем отфильтрованный трейн
- columntransformer
- фильтруем тест через csv со списком признаков после фильтрации 
- запускаем predict_proba
- считаем метрики
- закидываем submission_version_11

Импортируем библиотеки

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import warnings

warnings.simplefilter("ignore")

In [2]:
VERSION = 11

Считываем отфильтрованные данные

In [3]:
X = pd.read_parquet("../../data/intermediate_data/filter_train.parquet")

In [4]:
print(X.shape)

(519615, 228)


In [5]:
X.head()

Unnamed: 0,feature547,feature418,feature867,feature15,feature641,feature734,feature549,feature674,feature442,feature763,...,feature717,feature844,feature380,feature550,feature864,feature727,feature454,feature369,feature367,feature409
0,37,77,7063,6,0,14,17,1,35,0,...,4,1283,0,49,1619,28,0,1,4,56
1,1,77,7063,135,0,14,17,0,35,153,...,4,1283,0,49,2092,28,6,1,4,56
2,37,77,7063,0,0,14,17,0,35,153,...,4,1283,0,49,7174,28,83,1,3,56
3,37,77,7063,0,0,14,17,0,35,1,...,4,1283,0,49,7174,28,0,1,2,56
4,37,77,7063,0,0,14,17,0,35,153,...,4,1283,0,49,1439,28,1,2,2,56


In [6]:
X.target.value_counts()

target
0    501078
1     18537
Name: count, dtype: int64

In [7]:
y = X["target"]
X.drop(columns = ["target", "id"], inplace = True)

In [8]:
y.tail(3)

519612    0
519613    0
519614    1
Name: target, dtype: int64

In [9]:
unique_dict = {
    col: {
        'vals': X[col].unique(),
        'count': len(X[col].unique()),
        'high_cardinality': True if len(X[col].unique()) > 0.01 * len(X) else False
    }
    for col in X.columns
}

In [10]:
num_cols = [col for col in X.columns if unique_dict[col]['high_cardinality']]
cat_cols = [col for col in X.columns if not unique_dict[col]['high_cardinality']]

In [11]:
len(num_cols), len(cat_cols)

(31, 195)

In [12]:
def preprocessing_outliers(data: pd.DataFrame, 
                           features: list, 
                           g_m_int: int = 0.05, 
                           q_p_int: int = 0.95) -> pd.DataFrame:
    
    for feature in features:
        q_m = data[feature].quantile(g_m_int)
        q_p = data[feature].quantile(q_p_int)
        
        data.loc[data[feature] < q_m, feature] = q_m
        data.loc[data[feature] > q_p, feature] = q_p
        
    return data

In [13]:
X_wo_outliers = preprocessing_outliers(X, num_cols)

Разделяем данные на обучающую и тестовую выборки

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X_wo_outliers,
                                                    y, 
                                                    test_size=0.2, 
                                                    stratify=y, 
                                                    random_state=23)

In [33]:
column_transformer = ColumnTransformer([
    ('ohe', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('sc', StandardScaler(), num_cols)
])

pipeline = Pipeline(steps=[
    ('col_transf', column_transformer),
    ('model', CatBoostClassifier())
])

In [None]:
catboost_parameters = {
    'model__depth'         : [5, 7, 9],
    'model__learning_rate' : [0.05, 0.1, 0.2],
    'model__iterations'    : [250, 500]
}

In [36]:
grid_search_ct = GridSearchCV(estimator=pipeline,
                              param_grid=catboost_parameters,
                              scoring='roc_auc',
                              cv=3,
                              n_jobs=5,
                              verbose=2)

In [37]:
grid_search_ct.fit(X_train, y_train)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
466:	learn: 0.1366929	total: 2m 39s	remaining: 11.3s
467:	learn: 0.1366874	total: 2m 39s	remaining: 10.9s
468:	learn: 0.1366819	total: 2m 40s	remaining: 10.6s
469:	learn: 0.1366563	total: 2m 40s	remaining: 10.2s
470:	learn: 0.1366508	total: 2m 40s	remaining: 9.89s
471:	learn: 0.1366454	total: 2m 41s	remaining: 9.55s
472:	learn: 0.1366401	total: 2m 41s	remaining: 9.21s
473:	learn: 0.1366190	total: 2m 41s	remaining: 8.86s
474:	learn: 0.1366017	total: 2m 41s	remaining: 8.53s
475:	learn: 0.1365966	total: 2m 42s	remaining: 8.18s
476:	learn: 0.1365913	total: 2m 42s	remaining: 7.84s
477:	learn: 0.1365858	total: 2m 42s	remaining: 7.5s
478:	learn: 0.1365576	total: 2m 43s	remaining: 7.16s
479:	learn: 0.1365524	total: 2m 43s	remaining: 6.82s
480:	learn: 0.1365470	total: 2m 43s	remaining: 6.47s
481:	learn: 0.1365416	total: 2m 44s	remaining: 6.14s
482:	learn: 0.1365363	total: 2m 44s	remaining: 5.79s
483:	learn: 0.1365310	total: 2m 45s	remai

In [38]:
grid_search_ct.best_params_

{'model__depth': 5, 'model__iterations': 500, 'model__learning_rate': 0.05}

In [39]:
pred = grid_search_ct.predict_proba(X_test)
pred = pred[:, 1]

In [40]:
for threshold in [0.1, 0.3, 0.5]:
    pred_binary = (pred >= threshold)
    
    print("\tthreshold:", threshold)
    print("\tF1_SCORE:", f1_score(y_test, pred_binary))
    print("\tPRECISION:", precision_score(y_test, pred_binary))
    print("\tRECALL:", recall_score(y_test, pred_binary))
    print("\tROC_AUC:", roc_auc_score(y_test, pred))
    print()

	threshold: 0.1
	F1_SCORE: 0.14492753623188406
	PRECISION: 0.1404352226720648
	RECALL: 0.14971675209063934
	ROC_AUC: 0.6948636857525055

	threshold: 0.3
	F1_SCORE: 0.002688172043010753
	PRECISION: 0.38461538461538464
	RECALL: 0.001348799568384138
	ROC_AUC: 0.6948636857525055

	threshold: 0.5
	F1_SCORE: 0.0
	PRECISION: 0.0
	RECALL: 0.0
	ROC_AUC: 0.6948636857525055



In [41]:
best_threshold = 0.1

Сохраняем submission

In [42]:
X_test_submit = pd.read_parquet("../../data/input_data/test_sber.parquet")

In [43]:
print(X_test_submit.shape)

(173433, 1078)


In [44]:
X_test_submit.head()

Unnamed: 0,id,sample_ml_new,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,...,feature1067,feature1068,feature1069,feature1070,feature1071,feature1072,feature1073,feature1074,feature1075,feature1076
0,3,3,1696,458,26,102479,22,16,0,121,...,779,7740,9577,254,0,355,308,779,7740,9577
1,4,3,1688,53,78,103922,191,64,0,0,...,79401,109240,153820,24766,48600,46029,65113,79401,109240,153820
2,12,3,1689,13,81,104111,191,4,0,0,...,0,0,0,0,0,0,0,0,0,0
3,16,3,1761,1759,44,102433,191,4,0,0,...,0,0,0,0,0,0,0,0,0,0
4,20,3,1761,1759,77,102010,191,34,0,0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
filtered_columns = pd.read_csv("../../data/intermediate_data/cols_after_preprocessing.csv")

In [46]:
cols = list(set(filtered_columns.cols_after_preprocessing.values) & set(X_test_submit.columns))

In [47]:
X_filtered = X_test_submit[cols]

In [48]:
X_filtered.drop(columns=["id"], inplace=True)

In [49]:
print(X_filtered.shape)

(173433, 226)


In [50]:
X_filtered.head()

Unnamed: 0,feature771,feature781,feature362,feature867,feature600,feature23,feature783,feature25,feature410,feature453,...,feature239,feature516,feature868,feature641,feature813,feature452,feature612,feature267,feature369,feature460
0,20,79,18,7063,8,9,72,1638,0,64,...,0,0,3709,0,31,14,5,0,3,43
1,20,79,18,7063,2,9,72,300,21,64,...,0,0,1980,0,31,11,3,0,1,1
2,20,79,18,7063,23,9,72,632,21,9,...,0,113,3821,0,13,54,3,0,1,43
3,20,79,18,7063,1,9,72,1722,21,64,...,0,2,4122,0,31,63,0,0,2,43
4,20,79,18,1017,0,9,72,1722,21,3,...,0,0,2540,0,31,112,0,0,1,43


In [51]:
X_filtered_wo_outliers = preprocessing_outliers(X_filtered, cat_cols)

In [52]:
pred = grid_search_ct.predict_proba(X_filtered_wo_outliers)

pred = pred[:, 1]
pred_binary = (pred >= best_threshold).astype(int)

In [53]:
submission = pd.read_csv("../../data/intermediate_data/sample_submission.csv")
submission.head(10)

Unnamed: 0,id,target_bin,target_prob
0,3,0,0.03
1,4,0,0.03
2,12,1,0.03
3,16,1,0.03
4,20,0,0.03
5,23,0,0.03
6,26,0,0.03
7,50,0,0.03
8,51,1,0.03
9,53,0,0.03


In [54]:
submission["target_prob"] = pred
submission["target_bin"] = pred_binary
submission.head(10)

Unnamed: 0,id,target_bin,target_prob
0,3,0,0.043389
1,4,0,0.020123
2,12,0,0.021296
3,16,0,0.019821
4,20,0,0.060315
5,23,0,0.016052
6,26,0,0.021885
7,50,0,0.038243
8,51,0,0.04906
9,53,0,0.019651


In [55]:
submission.target_bin.value_counts()

target_bin
0    165046
1      8387
Name: count, dtype: int64

In [56]:
submission.to_csv(f"../../data/output_data/submission_version_{VERSION}_{best_threshold}.csv", index=False)

---