**Девятый подход (запускаем бейзлайн на отфильтрованном датасете + кодирование категориальных признаков):**

- используем отфильтрованный трейн
- кодируем признаки через ColumnTransformer (OneHotEncoder для категориальных, StandardScaler для числовых)
- фильтруем тест через csv со списком признаков после фильтрации
- запускаем predict_proba
- считаем метрики
- закидываем submission_version_9

Импортируем библиотеки

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas / scikit-learn warnings that may point at real problems.
warnings.simplefilter("ignore")

In [2]:
# Experiment number; it is embedded in the name of the submission file written at the end.
VERSION: int = 9

Считываем отфильтрованные данные

In [3]:
# Load the pre-filtered training set; at this point it still holds the "target" and "id" columns.
X = pd.read_parquet("../../data/intermediate_data/filter_train.parquet")

In [4]:
# Sanity check: (rows, columns) of the loaded training frame.
print(X.shape)

(519615, 228)


In [5]:
# Preview the first rows of the training frame.
X.head()

Unnamed: 0,feature547,feature418,feature867,feature15,feature641,feature734,feature549,feature674,feature442,feature763,...,feature717,feature844,feature380,feature550,feature864,feature727,feature454,feature369,feature367,feature409
0,37,77,7063,6,0,14,17,1,35,0,...,4,1283,0,49,1619,28,0,1,4,56
1,1,77,7063,135,0,14,17,0,35,153,...,4,1283,0,49,2092,28,6,1,4,56
2,37,77,7063,0,0,14,17,0,35,153,...,4,1283,0,49,7174,28,83,1,3,56
3,37,77,7063,0,0,14,17,0,35,1,...,4,1283,0,49,7174,28,0,1,2,56
4,37,77,7063,0,0,14,17,0,35,153,...,4,1283,0,49,1439,28,1,2,2,56


In [6]:
# Class balance of the label column.
X["target"].value_counts()

target
0    501078
1     18537
Name: count, dtype: int64

In [7]:
# Separate the label from the features; "id" is an identifier, not a feature.
y = X["target"]
X = X.drop(columns=["target", "id"])

In [8]:
# Peek at the last three labels.
y.tail(3)

519612    0
519613    0
519614    1
Name: target, dtype: int64

In [9]:
def _column_cardinality(values, n_rows, ratio=0.01):
    """Summarise the distinct values of one column.

    Args:
        values: pandas Series holding the column.
        n_rows: number of rows the cardinality is compared against.
        ratio: a column counts as high-cardinality when it has more than
            ``ratio * n_rows`` distinct values.

    Returns:
        Dict with the distinct values (``'vals'``), how many there are
        (``'count'``) and the boolean ``'high_cardinality'`` flag.
    """
    # Compute the distinct values once; they are needed for all three entries.
    uniques = values.unique()
    return {
        'vals': uniques,
        'count': len(uniques),
        'high_cardinality': len(uniques) > ratio * n_rows,
    }


# Per-column cardinality summary, used below to split features into
# "numeric" (high cardinality) and "categorical" (low cardinality).
unique_dict = {col: _column_cardinality(X[col], len(X)) for col in X.columns}

In [11]:
# Route every feature into exactly one list in a single pass over the columns:
# high-cardinality features are treated as numeric, the rest as categorical.
num_cols, cat_cols = [], []
for col in X.columns:
    bucket = num_cols if unique_dict[col]['high_cardinality'] else cat_cols
    bucket.append(col)

In [14]:
# How many features ended up in each group (numeric, categorical).
len(num_cols), len(cat_cols)

(31, 195)

Разделяем данные на обучающую и тестовую выборки

In [16]:
# Hold out 20% of the rows for validation; stratifying on y keeps the class
# ratio the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=23,
)

In [17]:
# Preprocessing: one-hot encode the low-cardinality features and standardise
# the high-cardinality ones. handle_unknown='ignore' lets the encoder accept
# categories at predict time that were not present when it was fitted.
categorical_encoder = OneHotEncoder(handle_unknown='ignore')
numeric_scaler = StandardScaler()

column_transformer = ColumnTransformer(
    transformers=[
        ('ohe', categorical_encoder, cat_cols),
        ('sc', numeric_scaler, num_cols),
    ]
)

# Full model: preprocessing followed by CatBoost with its default settings.
pipeline = Pipeline(
    steps=[
        ('col_transf', column_transformer),
        ('model', CatBoostClassifier()),
    ]
)

In [19]:
# Fit the preprocessing and the classifier on the training part of the split.
# Pipeline.fit returns the pipeline itself, so `model` is the fitted `pipeline`.
model = pipeline.fit(X_train, y_train)

Learning rate set to 0.135254
0:	learn: 0.5092706	total: 429ms	remaining: 7m 8s
1:	learn: 0.3900932	total: 836ms	remaining: 6m 56s
2:	learn: 0.3102304	total: 1.1s	remaining: 6m 6s
3:	learn: 0.2584198	total: 1.39s	remaining: 5m 47s
4:	learn: 0.2240157	total: 1.75s	remaining: 5m 48s
5:	learn: 0.2005698	total: 2.09s	remaining: 5m 46s
6:	learn: 0.1858711	total: 2.33s	remaining: 5m 29s
7:	learn: 0.1745446	total: 2.64s	remaining: 5m 27s
8:	learn: 0.1665959	total: 2.93s	remaining: 5m 22s
9:	learn: 0.1608528	total: 3.27s	remaining: 5m 23s
10:	learn: 0.1572013	total: 3.5s	remaining: 5m 14s
11:	learn: 0.1540902	total: 3.79s	remaining: 5m 12s
12:	learn: 0.1518398	total: 4.03s	remaining: 5m 5s
13:	learn: 0.1502830	total: 4.41s	remaining: 5m 10s
14:	learn: 0.1490815	total: 4.72s	remaining: 5m 10s
15:	learn: 0.1482022	total: 5.03s	remaining: 5m 9s
16:	learn: 0.1474431	total: 5.34s	remaining: 5m 8s
17:	learn: 0.1468205	total: 5.63s	remaining: 5m 7s
18:	learn: 0.1463944	total: 5.91s	remaining: 5m 5s
1

In [20]:
# Keep only the second probability column, i.e. the score for target == 1.
pred = model.predict_proba(X_test)[:, 1]

In [21]:
# ROC AUC is computed from the raw scores, so it is the same for every
# threshold: evaluate it once instead of on each iteration.
roc_auc = roc_auc_score(y_test, pred)

# Report threshold-dependent metrics on the hold-out set for a few cut-offs.
for threshold in [0.1, 0.3, 0.5]:
    pred_binary = (pred >= threshold)

    print("\tthreshold:", threshold)
    print("\tF1_SCORE:", f1_score(y_test, pred_binary))
    print("\tPRECISION:", precision_score(y_test, pred_binary))
    print("\tRECALL:", recall_score(y_test, pred_binary))
    print("\tROC_AUC:", roc_auc)
    print()

	threshold: 0.1
	F1_SCORE: 0.1460729141179349
	PRECISION: 0.13364674278038952
	RECALL: 0.1610466684650661
	ROC_AUC: 0.6989710480016522

	threshold: 0.3
	F1_SCORE: 0.011674184133722473
	PRECISION: 0.3548387096774194
	RECALL: 0.005934718100890208
	ROC_AUC: 0.6989710480016522

	threshold: 0.5
	F1_SCORE: 0.0010770059235325794
	PRECISION: 0.2857142857142857
	RECALL: 0.0005395198273536552
	ROC_AUC: 0.6989710480016522



In [22]:
# Cut-off used to binarise the submission scores; picked by hand as the value
# with the highest F1 among the thresholds printed above.
best_threshold: float = 0.1

Сохраняем submission

In [23]:
# Load the raw (unfiltered) competition test set that the submission is built for.
X_test_submit = pd.read_parquet("../../data/input_data/test_sber.parquet")

In [24]:
# Sanity check: (rows, columns) of the raw test frame.
print(X_test_submit.shape)

(173433, 1078)


In [25]:
# Preview the first rows of the raw test frame.
X_test_submit.head()

Unnamed: 0,id,sample_ml_new,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,...,feature1067,feature1068,feature1069,feature1070,feature1071,feature1072,feature1073,feature1074,feature1075,feature1076
0,3,3,1696,458,26,102479,22,16,0,121,...,779,7740,9577,254,0,355,308,779,7740,9577
1,4,3,1688,53,78,103922,191,64,0,0,...,79401,109240,153820,24766,48600,46029,65113,79401,109240,153820
2,12,3,1689,13,81,104111,191,4,0,0,...,0,0,0,0,0,0,0,0,0,0
3,16,3,1761,1759,44,102433,191,4,0,0,...,0,0,0,0,0,0,0,0,0,0
4,20,3,1761,1759,77,102010,191,34,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# Column names kept by the preprocessing step; they are read from the
# "cols_after_preprocessing" column of this CSV further below.
filtered_columns = pd.read_csv("../../data/intermediate_data/cols_after_preprocessing.csv")

In [27]:
# Keep only the preprocessed columns that exist in the test frame.
# The order (and de-duplication) follows the CSV rather than a set
# intersection, whose iteration order is arbitrary and may change between runs.
test_columns = set(X_test_submit.columns)
cols = [
    col
    for col in dict.fromkeys(filtered_columns.cols_after_preprocessing.values)
    if col in test_columns
]

In [28]:
# Restrict the test frame to the selected columns; row order is unchanged.
X_filtered = X_test_submit[cols]

In [29]:
# "id" is an identifier, not a feature. Rebind instead of dropping in place:
# X_filtered was selected from X_test_submit, and an in-place drop on such a
# frame may trigger pandas' SettingWithCopyWarning.
X_filtered = X_filtered.drop(columns=["id"])

In [30]:
# Sanity check: the column count should match the number of training features.
print(X_filtered.shape)

(173433, 226)


In [31]:
# Preview the first rows of the filtered test frame.
X_filtered.head()

Unnamed: 0,feature25,feature198,feature771,feature552,feature227,feature521,feature193,feature600,feature369,feature187,...,feature398,feature422,feature373,feature869,feature781,feature551,feature604,feature90,feature337,feature455
0,1638,28680,20,23,0,0,22144,8,3,118483,...,1,56,1,45,79,0,4,78982,1332,0
1,300,202554,20,23,0,63,183215,2,1,412233,...,0,56,0,34,79,1,3,0,1332,7
2,632,342741,20,23,0,63,337785,23,1,340254,...,0,56,0,27,79,1,3,0,1332,0
3,1722,402837,20,23,0,63,405690,1,2,245949,...,0,56,0,4121,79,1,0,0,1332,278
4,1722,88432,20,23,0,63,72170,0,1,86423,...,0,56,0,2200,79,1,0,165865,1332,1


In [33]:
# Score the competition test set: keep the score for target == 1 and
# binarise it with the threshold chosen on the hold-out set.
pred = model.predict_proba(X_filtered)[:, 1]
pred_binary = (pred >= best_threshold).astype(int)

In [34]:
# Load the submission template and preview its first ten rows.
submission = pd.read_csv("../../data/intermediate_data/sample_submission.csv")
submission.head(10)

Unnamed: 0,id,target_bin,target_prob
0,3,0,0.03
1,4,0,0.03
2,12,1,0.03
3,16,1,0.03
4,20,0,0.03
5,23,0,0.03
6,26,0,0.03
7,50,0,0.03
8,51,1,0.03
9,53,0,0.03


In [35]:
# The predictions follow the row order of X_test_submit and are written into
# the template by position, so the template must list the same ids in the
# same order. Fail loudly instead of silently saving misaligned predictions.
submission_ids = submission["id"].to_numpy()
test_ids = X_test_submit["id"].to_numpy()
if len(submission_ids) != len(test_ids) or (submission_ids != test_ids).any():
    raise ValueError(
        "sample_submission ids do not match the ids (or their order) in the test set"
    )

submission["target_prob"] = pred
submission["target_bin"] = pred_binary
submission.head(10)

Unnamed: 0,id,target_bin,target_prob
0,3,0,0.032859
1,4,0,0.018334
2,12,1,0.127922
3,16,0,0.017962
4,20,0,0.054552
5,23,0,0.013349
6,26,0,0.022758
7,50,0,0.034197
8,51,0,0.057815
9,53,0,0.01672


In [36]:
# How many test rows are predicted as each class at the chosen threshold.
submission.target_bin.value_counts()

target_bin
0    162904
1     10529
Name: count, dtype: int64

In [37]:
# Save the submission; the file name embeds the experiment version and the
# threshold, and index=False keeps the row index out of the file.
submission.to_csv(f"../../data/output_data/submission_version_{VERSION}_{best_threshold}.csv", index=False)

---