**Пятый подход (запускаем базовую модель на отфильтрованном датасете):**

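- используем отфильтрованную обучающую выборку
- оставляем в тестовой выборке только признаки из CSV-файла со списком признаков, прошедших фильтрацию
- получаем вероятности классов через predict_proba
- считаем метрики (F1, precision, recall, ROC AUC) на отложенной части обучающей выборки
- сохраняем результат в submission_version_5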

Импортируем библиотеки

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
import warnings

# Silence every warning for the rest of the session. NOTE(review): this also
# hides warnings raised by the libraries imported above.
warnings.simplefilter("ignore")

In [2]:
# Experiment number; it is embedded in the name of the submission file.
VERSION = 5

Считываем отфильтрованные данные

In [3]:
# Load the pre-filtered training frame; besides the features it still carries
# the "target" and "id" columns, which are separated out further down.
X = pd.read_parquet("../../data/intermediate_data/filter_train.parquet")

In [4]:
# Sanity check: (rows, columns) of the loaded training frame.
print(X.shape)

(519615, 228)


In [5]:
# Preview the first rows of the training frame.
X.head()

Unnamed: 0,feature547,feature418,feature867,feature15,feature641,feature734,feature549,feature674,feature442,feature763,...,feature717,feature844,feature380,feature550,feature864,feature727,feature454,feature369,feature367,feature409
0,37,77,7063,6,0,14,17,1,35,0,...,4,1283,0,49,1619,28,0,1,4,56
1,1,77,7063,135,0,14,17,0,35,153,...,4,1283,0,49,2092,28,6,1,4,56
2,37,77,7063,0,0,14,17,0,35,153,...,4,1283,0,49,7174,28,83,1,3,56
3,37,77,7063,0,0,14,17,0,35,1,...,4,1283,0,49,7174,28,0,1,2,56
4,37,77,7063,0,0,14,17,0,35,153,...,4,1283,0,49,1439,28,1,2,2,56


In [6]:
# Count the rows per target value to see the class balance.
X.target.value_counts()

target
0    501078
1     18537
Name: count, dtype: int64

In [7]:
# Detach the label column from the frame, then discard the row identifier;
# whatever remains in X is used as the feature matrix.
y = X.pop("target")
X.drop(columns="id", inplace=True)

In [8]:
# Show the last three labels.
y.tail(3)

519612    0
519613    0
519614    1
Name: target, dtype: int64

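Разделяем данные на обучающую и отложенную (валидационную) выборки; в коде отложенная часть называется X_test, но это не тестовый набор для submission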

In [15]:
# Hold out 20% of the rows, stratified on the label so that both parts keep
# the same class ratio; the fixed seed makes the split reproducible.
split_options = {"test_size": 0.2, "stratify": y, "random_state": 23}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [16]:
# Baseline: CatBoost with library-default hyperparameters, trained without
# per-iteration logging.
model = CatBoostClassifier()
model.fit(X=X_train, y=y_train, verbose=False)

<catboost.core.CatBoostClassifier at 0x7fde777882d0>

In [17]:
# Fit again with default logging so the per-iteration loss is printed.
# NOTE(review): `model` was already fitted on the same data in the previous
# cell, so this looks like a redundant retraining run - confirm and drop one
# of the two fits.
model.fit(X_train, y_train)

Learning rate set to 0.135254
0:	learn: 0.5083591	total: 27.5ms	remaining: 27.5s
1:	learn: 0.3873893	total: 53.8ms	remaining: 26.9s
2:	learn: 0.3082160	total: 82.5ms	remaining: 27.4s
3:	learn: 0.2575567	total: 107ms	remaining: 26.7s
4:	learn: 0.2236460	total: 134ms	remaining: 26.6s
5:	learn: 0.2008500	total: 157ms	remaining: 26s
6:	learn: 0.1852374	total: 182ms	remaining: 25.8s
7:	learn: 0.1739978	total: 211ms	remaining: 26.1s
8:	learn: 0.1658332	total: 233ms	remaining: 25.6s
9:	learn: 0.1601877	total: 258ms	remaining: 25.6s
10:	learn: 0.1563971	total: 286ms	remaining: 25.7s
11:	learn: 0.1536370	total: 310ms	remaining: 25.5s
12:	learn: 0.1513712	total: 334ms	remaining: 25.4s
13:	learn: 0.1500533	total: 361ms	remaining: 25.4s
14:	learn: 0.1487958	total: 386ms	remaining: 25.4s
15:	learn: 0.1479453	total: 411ms	remaining: 25.3s
16:	learn: 0.1471797	total: 435ms	remaining: 25.2s
17:	learn: 0.1465790	total: 459ms	remaining: 25s
18:	learn: 0.1460653	total: 485ms	remaining: 25s
19:	learn: 0.1

<catboost.core.CatBoostClassifier at 0x7fde777882d0>

In [18]:
# Score the held-out rows and keep only the second probability column
# (index 1), which is what the metrics below are computed from.
pred = model.predict_proba(X_test)[:, 1]

In [19]:
# ROC AUC is computed from the raw scores and does not depend on the
# threshold, so it is evaluated once instead of on every iteration.
roc_auc = roc_auc_score(y_test, pred)

# Report the threshold-dependent metrics for several candidate cut-offs.
for threshold in [0.1, 0.3, 0.5]:
    # Binarise the scores: rows at or above the cut-off are predicted positive.
    pred_binary = (pred >= threshold)

    print("\tthreshold:", threshold)
    print("\tF1_SCORE:", f1_score(y_test, pred_binary))
    print("\tPRECISION:", precision_score(y_test, pred_binary))
    print("\tRECALL:", recall_score(y_test, pred_binary))
    print("\tROC_AUC:", roc_auc)
    print()

	threshold: 0.1
	F1_SCORE: 0.15768694844403158
	PRECISION: 0.1384301732925586
	RECALL: 0.18316698138656595
	ROC_AUC: 0.7002531585995991

	threshold: 0.3
	F1_SCORE: 0.027607361963190184
	PRECISION: 0.2634146341463415
	RECALL: 0.014567035338548692
	ROC_AUC: 0.7002531585995991

	threshold: 0.5
	F1_SCORE: 0.004291845493562232
	PRECISION: 0.38095238095238093
	RECALL: 0.002158079309414621
	ROC_AUC: 0.7002531585995991



In [20]:
# Cut-off used to binarise the submission scores; of the thresholds compared
# on the held-out split, 0.1 gave the highest F1.
best_threshold = 0.1

Сохраняем submission

In [21]:
# Load the raw (unfiltered) test set that the submission is built for.
X_test_submit = pd.read_parquet("../../data/input_data/test_sber.parquet")

In [22]:
# Sanity check: (rows, columns) of the raw test frame.
print(X_test_submit.shape)

(173433, 1078)


In [23]:
# Preview the first rows of the raw test frame.
X_test_submit.head()

Unnamed: 0,id,sample_ml_new,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,...,feature1067,feature1068,feature1069,feature1070,feature1071,feature1072,feature1073,feature1074,feature1075,feature1076
0,3,3,1696,458,26,102479,22,16,0,121,...,779,7740,9577,254,0,355,308,779,7740,9577
1,4,3,1688,53,78,103922,191,64,0,0,...,79401,109240,153820,24766,48600,46029,65113,79401,109240,153820
2,12,3,1689,13,81,104111,191,4,0,0,...,0,0,0,0,0,0,0,0,0,0
3,16,3,1761,1759,44,102433,191,4,0,0,...,0,0,0,0,0,0,0,0,0,0
4,20,3,1761,1759,77,102010,191,34,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Table of column names kept by preprocessing; the names are read from its
# "cols_after_preprocessing" column.
filtered_columns = pd.read_csv("../../data/intermediate_data/cols_after_preprocessing.csv")

In [25]:
# Keep the selected columns that exist in the test frame, in the order they
# are listed in the CSV. A plain set intersection returns them in arbitrary
# order, which can differ from run to run; dict.fromkeys removes duplicates
# while preserving that order.
available_columns = set(X_test_submit.columns)
cols = [
    name
    for name in dict.fromkeys(filtered_columns.cols_after_preprocessing)
    if name in available_columns
]

In [26]:
# Restrict the test frame to the selected columns.
X_filtered = X_test_submit[cols]

In [27]:
# "id" is dropped before prediction, as it was for the training frame.
# Rebind the result instead of dropping in place: X_filtered comes from a
# column selection on X_test_submit, and in-place mutation of such a
# selection is the pattern pandas may flag with SettingWithCopyWarning.
X_filtered = X_filtered.drop(columns=["id"])

In [28]:
# Sanity check: (rows, columns) of the filtered test frame.
print(X_filtered.shape)

(173433, 226)


In [29]:
# Preview the first rows of the filtered test frame.
X_filtered.head()

Unnamed: 0,feature452,feature230,feature987,feature857,feature543,feature376,feature557,feature507,feature503,feature552,...,feature522,feature319,feature442,feature424,feature446,feature541,feature743,feature812,feature491,feature836
0,14,0,0,23,5,0,28,146,0,23,...,0,7854,35,110,6,2,23,1664,6,1289
1,11,0,0,1605,5,3,28,1,2,23,...,91,5054,35,110,2,1,23,1656,7,1289
2,54,0,0,1605,2,1,28,15,8,23,...,91,6450,35,110,15,0,23,1657,6,1289
3,63,0,0,1605,2,0,28,1,101,23,...,91,10388,0,110,14,50,23,1729,0,1289
4,112,0,0,1605,2,0,28,18,4,23,...,91,5303,35,110,100,0,23,1729,12,1289


In [31]:
# Present the features in exactly the order the model was trained on:
# X_filtered was assembled independently of the training frame, so its
# column order is not guaranteed to match. A feature missing from the test
# frame raises KeyError here instead of being silently mis-scored.
pred = model.predict_proba(X_filtered[model.feature_names_])

# Keep the second probability column and binarise it with the chosen cut-off.
pred = pred[:, 1]
pred_binary = (pred >= best_threshold).astype(int)

In [32]:
# Load the submission template (columns: id, target_bin, target_prob) and
# preview it.
submission = pd.read_csv("../../data/intermediate_data/sample_submission.csv")
submission.head(10)

Unnamed: 0,id,target_bin,target_prob
0,3,0,0.03
1,4,0,0.03
2,12,1,0.03
3,16,1,0.03
4,20,0,0.03
5,23,0,0.03
6,26,0,0.03
7,50,0,0.03
8,51,1,0.03
9,53,0,0.03


In [33]:
# Predictions were computed row by row from X_test_submit, so they may only
# be written positionally if the template lists the same ids in the same
# order. Fail loudly rather than produce a silently misaligned submission.
if len(submission) != len(pred) or not (
    submission["id"].to_numpy() == X_test_submit["id"].to_numpy()
).all():
    raise ValueError("sample_submission ids do not match the test set row order")

submission["target_prob"] = pred
submission["target_bin"] = pred_binary
submission.head(10)

Unnamed: 0,id,target_bin,target_prob
0,3,0,0.023142
1,4,0,0.010692
2,12,0,0.013866
3,16,0,0.019074
4,20,0,0.058245
5,23,0,0.01124
6,26,0,0.028467
7,50,0,0.033208
8,51,0,0.049913
9,53,0,0.014366


In [34]:
# Count how many test rows are predicted as each class at the chosen threshold.
submission.target_bin.value_counts()

target_bin
0    162754
1     10679
Name: count, dtype: int64

In [35]:
# Write the submission without the index column; the file name records both
# the experiment version and the threshold that was applied.
submission_path = f"../../data/output_data/submission_version_{VERSION}_{best_threshold}.csv"
submission.to_csv(submission_path, index=False)

---