**10 подход (запускаем гридсерч на отфильтрованном датасете + катег):**

- используем отфильтрованный трейн
- columntransformer
- фильтруем тест через csv со списком признаков после фильтрации
- запускаем predict_proba
- считаем метрики
- закидываем submission_version_10

Импортируем библиотеки

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import warnings

warnings.simplefilter("ignore")

In [2]:
VERSION = 10

Считываем отфильтрованные данные

In [3]:
X = pd.read_parquet("../../data/intermediate_data/filter_train.parquet")

In [4]:
print(X.shape)

(519615, 228)


In [5]:
X.head()

Unnamed: 0,feature547,feature418,feature867,feature15,feature641,feature734,feature549,feature674,feature442,feature763,...,feature717,feature844,feature380,feature550,feature864,feature727,feature454,feature369,feature367,feature409
0,37,77,7063,6,0,14,17,1,35,0,...,4,1283,0,49,1619,28,0,1,4,56
1,1,77,7063,135,0,14,17,0,35,153,...,4,1283,0,49,2092,28,6,1,4,56
2,37,77,7063,0,0,14,17,0,35,153,...,4,1283,0,49,7174,28,83,1,3,56
3,37,77,7063,0,0,14,17,0,35,1,...,4,1283,0,49,7174,28,0,1,2,56
4,37,77,7063,0,0,14,17,0,35,153,...,4,1283,0,49,1439,28,1,2,2,56


In [6]:
X.target.value_counts()

target
0    501078
1     18537
Name: count, dtype: int64

In [7]:
y = X["target"]
X.drop(columns = ["target", "id"], inplace = True)

In [8]:
y.tail(3)

519612    0
519613    0
519614    1
Name: target, dtype: int64

In [9]:
unique_dict = {
    col: {
        'vals': X[col].unique(),
        'count': len(X[col].unique()),
        'high_cardinality': True if len(X[col].unique()) > 0.01 * len(X) else False
    }
    for col in X.columns
}

In [10]:
num_cols = [col for col in X.columns if unique_dict[col]['high_cardinality']]
cat_cols = [col for col in X.columns if not unique_dict[col]['high_cardinality']]

In [11]:
len(num_cols), len(cat_cols)

(31, 195)

Разделяем данные на обучающую и тестовую выборки

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y, 
                                                    test_size=0.2, 
                                                    stratify=y, 
                                                    random_state=23)

In [17]:
column_transformer = ColumnTransformer([
    ('ohe', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('sc', StandardScaler(), num_cols)
])

pipeline = Pipeline(steps=[
    ('col_transf', column_transformer),
    ('model', CatBoostClassifier())
])

In [18]:
catboost_parameters = {
    'model__depth'         : [5, 7, 9],
    'model__learning_rate' : [0.05, 0.1, 0.2],
    'model__iterations'    : [250, 500]
}

In [19]:
grid_search_ct = GridSearchCV(estimator=pipeline,
                              param_grid=catboost_parameters,
                              scoring='roc_auc',
                              cv=3,
                              n_jobs=5,
                              verbose=2)

In [20]:
grid_search_ct.fit(X_train, y_train)

Fitting 3 folds for each of 18 candidates, totalling 54 fits
0:	learn: 0.5533719	total: 605ms	remaining: 2m 30s
1:	learn: 0.4504249	total: 1.2s	remaining: 2m 28s
2:	learn: 0.3748978	total: 1.68s	remaining: 2m 18s
3:	learn: 0.3184197	total: 1.79s	remaining: 1m 50s
4:	learn: 0.2766897	total: 1.88s	remaining: 1m 32s
5:	learn: 0.2457464	total: 1.98s	remaining: 1m 20s
6:	learn: 0.2224200	total: 2.08s	remaining: 1m 12s
7:	learn: 0.2054164	total: 2.19s	remaining: 1m 6s
8:	learn: 0.1928334	total: 2.29s	remaining: 1m 1s
9:	learn: 0.1824412	total: 2.4s	remaining: 57.6s
10:	learn: 0.1751199	total: 2.49s	remaining: 54.2s
11:	learn: 0.1690981	total: 2.61s	remaining: 51.9s
12:	learn: 0.1643649	total: 2.72s	remaining: 49.5s
13:	learn: 0.1607315	total: 2.84s	remaining: 47.9s
14:	learn: 0.1578325	total: 2.95s	remaining: 46.2s
15:	learn: 0.1554689	total: 3.04s	remaining: 44.5s
16:	learn: 0.1536509	total: 3.16s	remaining: 43.3s
17:	learn: 0.1521422	total: 3.26s	remaining: 42.1s
18:	learn: 0.1508358	total

In [21]:
pred = grid_search_ct.predict_proba(X_test)
pred = pred[:, 1]

In [22]:
for threshold in [0.1, 0.3, 0.5]:
    pred_binary = (pred >= threshold)
    
    print("\tthreshold:", threshold)
    print("\tF1_SCORE:", f1_score(y_test, pred_binary))
    print("\tPRECISION:", precision_score(y_test, pred_binary))
    print("\tRECALL:", recall_score(y_test, pred_binary))
    print("\tROC_AUC:", roc_auc_score(y_test, pred))
    print()

	threshold: 0.1
	F1_SCORE: 0.15231868264923115
	PRECISION: 0.13818101933216168
	RECALL: 0.16967898570272458
	ROC_AUC: 0.6992668805436906

	threshold: 0.3
	F1_SCORE: 0.013137151865475564
	PRECISION: 0.25252525252525254
	RECALL: 0.006743997841920691
	ROC_AUC: 0.6992668805436906

	threshold: 0.5
	F1_SCORE: 0.0010755579456843238
	PRECISION: 0.16666666666666666
	RECALL: 0.0005395198273536552
	ROC_AUC: 0.6992668805436906



In [56]:
best_threshold = 0.1

Сохраняем submission

In [57]:
X_test_submit = pd.read_parquet("../../data/input_data/test_sber.parquet")

In [58]:
print(X_test_submit.shape)

(173433, 1078)


In [59]:
X_test_submit.head()

Unnamed: 0,id,sample_ml_new,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,...,feature1067,feature1068,feature1069,feature1070,feature1071,feature1072,feature1073,feature1074,feature1075,feature1076
0,3,3,1696,458,26,102479,22,16,0,121,...,779,7740,9577,254,0,355,308,779,7740,9577
1,4,3,1688,53,78,103922,191,64,0,0,...,79401,109240,153820,24766,48600,46029,65113,79401,109240,153820
2,12,3,1689,13,81,104111,191,4,0,0,...,0,0,0,0,0,0,0,0,0,0
3,16,3,1761,1759,44,102433,191,4,0,0,...,0,0,0,0,0,0,0,0,0,0
4,20,3,1761,1759,77,102010,191,34,0,0,...,0,0,0,0,0,0,0,0,0,0


In [60]:
filtered_columns = pd.read_csv("../../data/intermediate_data/cols_after_preprocessing.csv")

In [61]:
cols = list(set(filtered_columns.cols_after_preprocessing.values) & set(X_test_submit.columns))

In [62]:
X_filtered = X_test_submit[cols]

In [63]:
X_filtered.drop(columns=["id"], inplace=True)

In [64]:
print(X_filtered.shape)

(173433, 226)


In [65]:
X_filtered.head()

Unnamed: 0,feature56,feature408,feature142,feature255,feature725,feature750,feature556,feature551,feature781,feature610,...,feature22,feature530,feature516,feature541,feature611,feature410,feature760,feature447,feature779,feature558
0,0,13,246528,11963,14,4,13,0,79,14,...,119624,82,0,2,5,0,0,0,39,20
1,0,13,220678,0,14,6,13,1,79,5,...,140184,161,0,1,3,21,12,49,39,20
2,0,13,579624,0,0,10,13,1,79,26,...,140184,246,113,0,3,21,2,49,39,20
3,0,13,294469,0,14,3,13,1,79,1,...,140184,274,2,50,0,21,0,49,39,20
4,0,13,118000,0,14,3,13,1,79,5,...,140184,363,0,0,0,21,0,0,39,20


In [67]:
pred = grid_search_ct.predict_proba(X_filtered)

pred = pred[:, 1]
pred_binary = (pred >= best_threshold).astype(int)

In [68]:
submission = pd.read_csv("../../data/intermediate_data/sample_submission.csv")
submission.head(10)

Unnamed: 0,id,target_bin,target_prob
0,3,0,0.03
1,4,0,0.03
2,12,1,0.03
3,16,1,0.03
4,20,0,0.03
5,23,0,0.03
6,26,0,0.03
7,50,0,0.03
8,51,1,0.03
9,53,0,0.03


In [69]:
submission["target_prob"] = pred
submission["target_bin"] = pred_binary
submission.head(10)

Unnamed: 0,id,target_bin,target_prob
0,3,0,0.043726
1,4,0,0.016954
2,12,1,0.133978
3,16,0,0.017859
4,20,0,0.050296
5,23,0,0.01412
6,26,0,0.018392
7,50,0,0.036499
8,51,0,0.04084
9,53,0,0.019699


In [70]:
submission.target_bin.value_counts()

target_bin
0    162946
1     10487
Name: count, dtype: int64

In [40]:
submission.to_csv(f"../../data/output_data/submission_version_{VERSION}_{best_threshold}.csv", index=False)

---