**12 подход (запускаем catboost и xgboost на отфильтрованном датасете + катег):**

- используем отфильтрованный трейн
- columntransformer
- фильтруем тест через csv со списком признаков после фильтрации 
- запускаем predict_proba
- считаем метрики
- закидываем submission_version_12

Импортируем библиотеки

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import warnings

warnings.simplefilter("ignore")

In [2]:
VERSION = 12

Считываем отфильтрованные данные

In [3]:
X = pd.read_parquet("../../data/intermediate_data/filter_train.parquet")

In [4]:
print(X.shape)

(519615, 228)


In [5]:
X.head()

Unnamed: 0,feature547,feature418,feature867,feature15,feature641,feature734,feature549,feature674,feature442,feature763,...,feature717,feature844,feature380,feature550,feature864,feature727,feature454,feature369,feature367,feature409
0,37,77,7063,6,0,14,17,1,35,0,...,4,1283,0,49,1619,28,0,1,4,56
1,1,77,7063,135,0,14,17,0,35,153,...,4,1283,0,49,2092,28,6,1,4,56
2,37,77,7063,0,0,14,17,0,35,153,...,4,1283,0,49,7174,28,83,1,3,56
3,37,77,7063,0,0,14,17,0,35,1,...,4,1283,0,49,7174,28,0,1,2,56
4,37,77,7063,0,0,14,17,0,35,153,...,4,1283,0,49,1439,28,1,2,2,56


In [6]:
X.target.value_counts()

target
0    501078
1     18537
Name: count, dtype: int64

In [7]:
y = X["target"]
X.drop(columns = ["target", "id"], inplace = True)

In [8]:
y.tail(3)

519612    0
519613    0
519614    1
Name: target, dtype: int64

In [9]:
unique_dict = {
    col: {
        'vals': X[col].unique(),
        'count': len(X[col].unique()),
        'high_cardinality': True if len(X[col].unique()) > 0.01 * len(X) else False
    }
    for col in X.columns
}

In [10]:
num_cols = [col for col in X.columns if unique_dict[col]['high_cardinality']]
cat_cols = [col for col in X.columns if not unique_dict[col]['high_cardinality']]

In [11]:
len(num_cols), len(cat_cols)

(31, 195)

Разделяем данные на обучающую и тестовую выборки

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y, 
                                                    test_size=0.2, 
                                                    stratify=y, 
                                                    random_state=23)

Catboost

In [17]:
column_transformer = ColumnTransformer([
    ('ohe', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('sc', StandardScaler(), num_cols)
])

pipeline = Pipeline(steps=[
    ('col_transf', column_transformer),
    ('model', CatBoostClassifier())
])

In [18]:
model_ct = pipeline.fit(X_train, y_train)

Learning rate set to 0.135254
0:	learn: 0.5092706	total: 1.42s	remaining: 23m 43s
1:	learn: 0.3900932	total: 2.11s	remaining: 17m 35s
2:	learn: 0.3102304	total: 2.91s	remaining: 16m 8s
3:	learn: 0.2584198	total: 3.63s	remaining: 15m 3s
4:	learn: 0.2240157	total: 4.45s	remaining: 14m 45s
5:	learn: 0.2005698	total: 5.98s	remaining: 16m 31s
6:	learn: 0.1858711	total: 6.94s	remaining: 16m 24s
7:	learn: 0.1745446	total: 8.07s	remaining: 16m 40s
8:	learn: 0.1665959	total: 8.86s	remaining: 16m 15s
9:	learn: 0.1608528	total: 9.64s	remaining: 15m 54s
10:	learn: 0.1572013	total: 10.2s	remaining: 15m 14s
11:	learn: 0.1540902	total: 10.7s	remaining: 14m 43s
12:	learn: 0.1518398	total: 11.5s	remaining: 14m 29s
13:	learn: 0.1502830	total: 12.1s	remaining: 14m 14s
14:	learn: 0.1490815	total: 13.2s	remaining: 14m 29s
15:	learn: 0.1482022	total: 14s	remaining: 14m 22s
16:	learn: 0.1474431	total: 14.8s	remaining: 14m 13s
17:	learn: 0.1468205	total: 15.5s	remaining: 14m 3s
18:	learn: 0.1463944	total: 16.

In [19]:
pred_ct = model_ct.predict_proba(X_test)
pred_ct = pred_ct[:, 1]

In [20]:
for threshold in [0.1, 0.3, 0.5]:
    pred_binary_ct = (pred_ct >= threshold)
    
    print("\tthreshold:", threshold)
    print("\tF1_SCORE:", f1_score(y_test, pred_binary_ct))
    print("\tPRECISION:", precision_score(y_test, pred_binary_ct))
    print("\tRECALL:", recall_score(y_test, pred_binary_ct))
    print("\tROC_AUC:", roc_auc_score(y_test, pred_ct))
    print()

	threshold: 0.1
	F1_SCORE: 0.1460729141179349
	PRECISION: 0.13364674278038952
	RECALL: 0.1610466684650661
	ROC_AUC: 0.6989710480016522

	threshold: 0.3
	F1_SCORE: 0.011674184133722473
	PRECISION: 0.3548387096774194
	RECALL: 0.005934718100890208
	ROC_AUC: 0.6989710480016522

	threshold: 0.5
	F1_SCORE: 0.0010770059235325794
	PRECISION: 0.2857142857142857
	RECALL: 0.0005395198273536552
	ROC_AUC: 0.6989710480016522



XGBoost

In [21]:
column_transformer = ColumnTransformer([
    ('ohe', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('sc', StandardScaler(), num_cols)
])

pipeline = Pipeline(steps=[
    ('col_transf', column_transformer),
    ('model', XGBClassifier())
])

In [22]:
model_xgb = pipeline.fit(X_train, y_train)

In [23]:
pred_xgb = model_xgb.predict_proba(X_test)
pred_xgb = pred_xgb[:, 1]

In [24]:
for threshold in [0.1, 0.3, 0.5]:
    pred_binary_xgb = (pred_xgb >= threshold)
    
    print("\tthreshold:", threshold)
    print("\tF1_SCORE:", f1_score(y_test, pred_binary_xgb))
    print("\tPRECISION:", precision_score(y_test, pred_binary_xgb))
    print("\tRECALL:", recall_score(y_test, pred_binary_xgb))
    print("\tROC_AUC:", roc_auc_score(y_test, pred_xgb))
    print()

	threshold: 0.1
	F1_SCORE: 0.1463575431271289
	PRECISION: 0.12347052280311457
	RECALL: 0.1796601025087672
	ROC_AUC: 0.6935436613645035

	threshold: 0.3
	F1_SCORE: 0.022639567790069464
	PRECISION: 0.24444444444444444
	RECALL: 0.011869436201780416
	ROC_AUC: 0.6935436613645035

	threshold: 0.5
	F1_SCORE: 0.002150537634408602
	PRECISION: 0.3076923076923077
	RECALL: 0.0010790396547073105
	ROC_AUC: 0.6935436613645035



In [25]:
best_threshold = 0.1

In [26]:
for coeff_ct in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
    coeff_xgb = 1 - coeff_ct
    
    pred = coeff_ct * pred_ct + coeff_xgb * pred_xgb
    
    pred_binary = (pred >= best_threshold)
    
    print("\tthreshold:", coeff_ct)
    print("\tF1_SCORE:", f1_score(y_test, pred_binary))
    print("\tPRECISION:", precision_score(y_test, pred_binary))
    print("\tRECALL:", recall_score(y_test, pred_binary))
    print("\tROC_AUC:", roc_auc_score(y_test, pred))
    print()

	threshold: 0.1
	F1_SCORE: 0.14870137108460596
	PRECISION: 0.126709726443769
	RECALL: 0.17992986242244402
	ROC_AUC: 0.6955782846521167

	threshold: 0.2
	F1_SCORE: 0.15039637599093997
	PRECISION: 0.129611555729065
	RECALL: 0.17912058268141354
	ROC_AUC: 0.6972083945831038

	threshold: 0.3
	F1_SCORE: 0.15244883881352034
	PRECISION: 0.13283911039871768
	RECALL: 0.17885082276773673
	ROC_AUC: 0.6985509088337899

	threshold: 0.4
	F1_SCORE: 0.1514939309056956
	PRECISION: 0.13351162312281423
	RECALL: 0.17507418397626112
	ROC_AUC: 0.6996204276453715

	threshold: 0.5
	F1_SCORE: 0.15261233636041985
	PRECISION: 0.13558256496227997
	RECALL: 0.17453466414890748
	ROC_AUC: 0.7004295431875243

	threshold: 0.6
	F1_SCORE: 0.15222342083183507
	PRECISION: 0.13697152717860225
	RECALL: 0.17129754518478554
	ROC_AUC: 0.7009419782754011

	threshold: 0.7
	F1_SCORE: 0.15011463738385422
	PRECISION: 0.13580786026200872
	RECALL: 0.16779066630698677
	ROC_AUC: 0.7011472510986736

	threshold: 0.8
	F1_SCORE: 0.1493624772

In [27]:
best_coeff_ct = 0.7

Сохраняем submission

In [28]:
X_test_submit = pd.read_parquet("../../data/input_data/test_sber.parquet")

In [29]:
print(X_test_submit.shape)

(173433, 1078)


In [30]:
X_test_submit.head()

Unnamed: 0,id,sample_ml_new,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,...,feature1067,feature1068,feature1069,feature1070,feature1071,feature1072,feature1073,feature1074,feature1075,feature1076
0,3,3,1696,458,26,102479,22,16,0,121,...,779,7740,9577,254,0,355,308,779,7740,9577
1,4,3,1688,53,78,103922,191,64,0,0,...,79401,109240,153820,24766,48600,46029,65113,79401,109240,153820
2,12,3,1689,13,81,104111,191,4,0,0,...,0,0,0,0,0,0,0,0,0,0
3,16,3,1761,1759,44,102433,191,4,0,0,...,0,0,0,0,0,0,0,0,0,0
4,20,3,1761,1759,77,102010,191,34,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
filtered_columns = pd.read_csv("../../data/intermediate_data/cols_after_preprocessing.csv")

In [32]:
cols = list(set(filtered_columns.cols_after_preprocessing.values) & set(X_test_submit.columns))

In [33]:
X_filtered = X_test_submit[cols]

In [34]:
X_filtered.drop(columns=["id"], inplace=True)

In [35]:
print(X_filtered.shape)

(173433, 226)


In [36]:
X_filtered.head()

Unnamed: 0,feature488,feature490,feature314,feature530,feature549,feature90,feature330,feature603,feature760,feature414,...,feature831,feature185,feature449,feature198,feature227,feature337,feature766,feature749,feature641,feature518
0,58,6,92684,82,17,78982,2076,4,0,101,...,1306,9945,47,28680,0,1332,14,10,0,3
1,75,1,303793,161,17,0,2014,3,12,101,...,0,66467,0,202554,0,1332,14,0,0,1
2,142,3,400876,246,17,0,1350,3,2,101,...,1306,87996,3,342741,0,1332,14,1,0,2
3,115,69,146653,274,17,0,2076,0,0,101,...,1306,234812,47,402837,0,1332,14,0,0,2
4,165,69,139526,363,17,165865,2076,0,0,101,...,1306,234812,47,88432,0,1332,14,0,0,1


In [39]:
pred_ct = model_ct.predict_proba(X_filtered)

pred_ct = pred_ct[:, 1]

In [40]:
pred_xgb = model_xgb.predict_proba(X_filtered)

pred_xgb = pred_xgb[:, 1]

In [41]:
pred = best_coeff_ct * pred_ct + (1 - best_coeff_ct) * pred_xgb
pred_binary = (pred >= best_threshold).astype(int)

In [42]:
submission = pd.read_csv("../../data/intermediate_data/sample_submission.csv")
submission.head(10)

Unnamed: 0,id,target_bin,target_prob
0,3,0,0.03
1,4,0,0.03
2,12,1,0.03
3,16,1,0.03
4,20,0,0.03
5,23,0,0.03
6,26,0,0.03
7,50,0,0.03
8,51,1,0.03
9,53,0,0.03


In [43]:
submission["target_prob"] = pred
submission["target_bin"] = pred_binary
submission.head(10)

Unnamed: 0,id,target_bin,target_prob
0,3,0,0.034106
1,4,0,0.016646
2,12,0,0.096141
3,16,0,0.017985
4,20,0,0.055308
5,23,0,0.013061
6,26,0,0.021822
7,50,0,0.042828
8,51,0,0.054084
9,53,0,0.015871


In [44]:
submission.target_bin.value_counts()

target_bin
0    162783
1     10650
Name: count, dtype: int64

In [45]:
submission.to_csv(f"../../data/output_data/submission_version_{VERSION}_{best_threshold}.csv", index=False)

---