**Подход 13 (запускаем CatBoost и XGBoost на отфильтрованном датасете + кодирование категориальных признаков + обработка выбросов):**

- используем отфильтрованный трейн
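- обрабатываем выбросы (обрезаем значения по 5%- и 95%-квантилям)
- кодируем признаки через ColumnTransformer (OneHotEncoder для категориальных, StandardScaler для числовых)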
- фильтруем тест через csv со списком признаков после фильтрации
- запускаем predict_proba
- считаем метрики
- закидываем submission_version_13

Импортируем библиотеки

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import warnings

warnings.simplefilter("ignore")

In [2]:
# Experiment version number; it is interpolated into the submission CSV file name.
VERSION = 13

Считываем отфильтрованные данные

In [3]:
X = pd.read_parquet("../../data/intermediate_data/filter_train.parquet")

In [4]:
print(X.shape)

(519615, 228)


In [5]:
X.head()

Unnamed: 0,feature547,feature418,feature867,feature15,feature641,feature734,feature549,feature674,feature442,feature763,...,feature717,feature844,feature380,feature550,feature864,feature727,feature454,feature369,feature367,feature409
0,37,77,7063,6,0,14,17,1,35,0,...,4,1283,0,49,1619,28,0,1,4,56
1,1,77,7063,135,0,14,17,0,35,153,...,4,1283,0,49,2092,28,6,1,4,56
2,37,77,7063,0,0,14,17,0,35,153,...,4,1283,0,49,7174,28,83,1,3,56
3,37,77,7063,0,0,14,17,0,35,1,...,4,1283,0,49,7174,28,0,1,2,56
4,37,77,7063,0,0,14,17,0,35,153,...,4,1283,0,49,1439,28,1,2,2,56


In [6]:
X.target.value_counts()

target
0    501078
1     18537
Name: count, dtype: int64

In [7]:
# Split the label off the feature frame.
y = X["target"]
# Rebind to a new frame instead of dropping in place, so the loaded frame is not
# mutated across cells. `id` is dropped too (presumably a row identifier, not a feature).
X = X.drop(columns=["target", "id"])

In [8]:
y.tail(3)

519612    0
519613    0
519614    1
Name: target, dtype: int64

In [9]:
# Per-column cardinality summary: the distinct values, how many there are, and
# whether the column has more distinct values than 1% of the row count.
unique_dict = {}
for col in X.columns:
    # `unique()` scans the whole column, so compute it once and reuse the result.
    vals = X[col].unique()
    unique_dict[col] = {
        'vals': vals,
        'count': len(vals),
        'high_cardinality': len(vals) > 0.01 * len(X),
    }

In [10]:
# Columns flagged as high-cardinality are treated as numeric, all others as categorical.
# Both lists keep the column order of X.
num_cols, cat_cols = [], []
for col in X.columns:
    if unique_dict[col]['high_cardinality']:
        num_cols.append(col)
    else:
        cat_cols.append(col)

In [11]:
len(num_cols), len(cat_cols)

(31, 195)

In [12]:
def preprocessing_outliers(data: pd.DataFrame, g_m_int: float = 0.05, q_p_int: float = 0.95) -> pd.DataFrame:
    """Clip every column to its own quantile range.

    For each column, values below the ``g_m_int`` quantile are raised to that
    quantile and values above the ``q_p_int`` quantile are lowered to it.
    The bounds are computed from ``data`` itself.

    Args:
        data: Frame to clip; every column is processed.
        g_m_int: Lower quantile level in [0, 1].
        q_p_int: Upper quantile level in [0, 1].

    Returns:
        A new DataFrame with the clipped values. ``data`` is left unmodified,
        so passing a slice of another frame is safe and the call can be repeated.
    """
    if data.empty:
        # Nothing to clip; still hand back an independent object.
        return data.copy()

    # One quantile per column, indexed by column name.
    lower = data.quantile(g_m_int)
    upper = data.quantile(q_p_int)

    # axis=1 aligns the per-column bounds with the frame's columns.
    return data.clip(lower=lower, upper=upper, axis=1)

In [13]:
# NOTE(review): the clipping bounds are computed on the full dataset, before the
# train/hold-out split below, so hold-out rows contribute to the bounds.
# Every column is clipped, including the ones listed in `cat_cols`.
X_wo_outliers = preprocessing_outliers(X)

Разделяем данные на обучающую и тестовую выборки

In [14]:
# Hold out 20% of the rows; stratify on the label so both parts keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X_wo_outliers,
    y,
    test_size=0.2,
    stratify=y,
    random_state=23,
)

Catboost

In [15]:
# Preprocessing: one-hot encode the low-cardinality columns, standardize the rest.
# Categories unseen during fit are encoded as all zeros (handle_unknown='ignore').
column_transformer = ColumnTransformer([
    ('ohe', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('sc', StandardScaler(), num_cols)
])

pipeline = Pipeline(steps=[
    ('col_transf', column_transformer),
    # Log every 100th iteration instead of every iteration to keep the cell output readable.
    ('model', CatBoostClassifier(verbose=100))
])

In [16]:
# Fit the preprocessing + CatBoost pipeline on the training split.
model_ct = pipeline.fit(X_train, y_train)

Learning rate set to 0.135254
0:	learn: 0.5105121	total: 210ms	remaining: 3m 29s
1:	learn: 0.3900239	total: 312ms	remaining: 2m 35s
2:	learn: 0.3090650	total: 476ms	remaining: 2m 38s
3:	learn: 0.2579033	total: 593ms	remaining: 2m 27s
4:	learn: 0.2245738	total: 733ms	remaining: 2m 25s
5:	learn: 0.2012614	total: 863ms	remaining: 2m 22s
6:	learn: 0.1855506	total: 1.02s	remaining: 2m 24s
7:	learn: 0.1740757	total: 1.15s	remaining: 2m 22s
8:	learn: 0.1663377	total: 1.28s	remaining: 2m 21s
9:	learn: 0.1611144	total: 1.4s	remaining: 2m 18s
10:	learn: 0.1570120	total: 1.55s	remaining: 2m 19s
11:	learn: 0.1543208	total: 1.66s	remaining: 2m 16s
12:	learn: 0.1521475	total: 1.84s	remaining: 2m 19s
13:	learn: 0.1506045	total: 1.92s	remaining: 2m 15s
14:	learn: 0.1492721	total: 2.03s	remaining: 2m 13s
15:	learn: 0.1482490	total: 2.16s	remaining: 2m 13s
16:	learn: 0.1475233	total: 2.27s	remaining: 2m 11s
17:	learn: 0.1468974	total: 2.43s	remaining: 2m 12s
18:	learn: 0.1464583	total: 2.52s	remaining: 

In [25]:
# Keep only the second column of predict_proba, i.e. the score for class 1.
pred_ct = model_ct.predict_proba(X_test)[:, 1]

In [18]:
# ROC AUC is computed from the raw scores and does not depend on the decision
# threshold, so evaluate it once instead of on every iteration.
roc_auc_ct = roc_auc_score(y_test, pred_ct)

for threshold in [0.1, 0.3, 0.5]:
    # Binarize the scores at the current threshold.
    pred_binary_ct = (pred_ct >= threshold)

    print("\tthreshold:", threshold)
    print("\tF1_SCORE:", f1_score(y_test, pred_binary_ct))
    print("\tPRECISION:", precision_score(y_test, pred_binary_ct))
    print("\tRECALL:", recall_score(y_test, pred_binary_ct))
    print("\tROC_AUC:", roc_auc_ct)
    print()

	threshold: 0.1
	F1_SCORE: 0.14944895240402084
	PRECISION: 0.1356043956043956
	RECALL: 0.16644186673860265
	ROC_AUC: 0.700716783552221

	threshold: 0.3
	F1_SCORE: 0.00480128034142438
	PRECISION: 0.21428571428571427
	RECALL: 0.0024278392230914487
	ROC_AUC: 0.700716783552221

	threshold: 0.5
	F1_SCORE: 0.0
	PRECISION: 0.0
	RECALL: 0.0
	ROC_AUC: 0.700716783552221



XGBoost

In [19]:
# Same preprocessing as for CatBoost: one-hot encode `cat_cols`, standardize `num_cols`.
column_transformer = ColumnTransformer(
    transformers=[
        ('ohe', OneHotEncoder(handle_unknown='ignore'), cat_cols),
        ('sc', StandardScaler(), num_cols),
    ]
)

# Rebinds `pipeline` to a fresh, unfitted pipeline with an XGBoost classifier on top.
pipeline = Pipeline([
    ('col_transf', column_transformer),
    ('model', XGBClassifier()),
])

In [20]:
# Fit the preprocessing + XGBoost pipeline on the training split.
model_xgb = pipeline.fit(X_train, y_train)

In [21]:
# Keep only the second column of predict_proba, i.e. the score for class 1.
pred_xgb = model_xgb.predict_proba(X_test)[:, 1]

In [22]:
# ROC AUC is computed from the raw scores and does not depend on the decision
# threshold, so evaluate it once instead of on every iteration.
roc_auc_xgb = roc_auc_score(y_test, pred_xgb)

for threshold in [0.1, 0.3, 0.5]:
    # Binarize the scores at the current threshold.
    pred_binary_xgb = (pred_xgb >= threshold)

    print("\tthreshold:", threshold)
    print("\tF1_SCORE:", f1_score(y_test, pred_binary_xgb))
    print("\tPRECISION:", precision_score(y_test, pred_binary_xgb))
    print("\tRECALL:", recall_score(y_test, pred_binary_xgb))
    print("\tROC_AUC:", roc_auc_xgb)
    print()

	threshold: 0.1
	F1_SCORE: 0.14112631815122278
	PRECISION: 0.12079892452467832
	RECALL: 0.16967898570272458
	ROC_AUC: 0.6934156575721449

	threshold: 0.3
	F1_SCORE: 0.01854714064914992
	PRECISION: 0.2057142857142857
	RECALL: 0.009711356892365795
	ROC_AUC: 0.6934156575721449

	threshold: 0.5
	F1_SCORE: 0.0005382131324004305
	PRECISION: 0.1111111111111111
	RECALL: 0.0002697599136768276
	ROC_AUC: 0.6934156575721449



In [23]:
# Decision threshold used for the blend evaluation and the final submission.
# Set by hand: 0.1 gave the highest F1 of the three thresholds tried for both models.
best_threshold = 0.1

In [24]:
# Sweep the CatBoost weight of a two-model linear blend; XGBoost gets the remainder.
for coeff_ct in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
    coeff_xgb = 1 - coeff_ct

    pred = coeff_ct * pred_ct + coeff_xgb * pred_xgb

    pred_binary = (pred >= best_threshold)

    # The value varied here is the blend weight, not the threshold, so label it as such.
    print("\tcoeff_ct:", coeff_ct)
    print("\tF1_SCORE:", f1_score(y_test, pred_binary))
    print("\tPRECISION:", precision_score(y_test, pred_binary))
    print("\tRECALL:", recall_score(y_test, pred_binary))
    print("\tROC_AUC:", roc_auc_score(y_test, pred))
    print()

	threshold: 0.1
	F1_SCORE: 0.14246700045516614
	PRECISION: 0.123204093682346
	RECALL: 0.1688697059616941
	ROC_AUC: 0.6959802300459657

	threshold: 0.2
	F1_SCORE: 0.14186851211072665
	PRECISION: 0.12391698569413662
	RECALL: 0.165902346911249
	ROC_AUC: 0.698108029467249

	threshold: 0.3
	F1_SCORE: 0.1435082388687624
	PRECISION: 0.1265979381443299
	RECALL: 0.16563258699757216
	ROC_AUC: 0.6998457381153014

	threshold: 0.4
	F1_SCORE: 0.14515747099218565
	PRECISION: 0.12935218400506435
	RECALL: 0.16536282708389533
	ROC_AUC: 0.7012388202367699

	threshold: 0.5
	F1_SCORE: 0.1471954269381922
	PRECISION: 0.13176972281449895
	RECALL: 0.16671162665227948
	ROC_AUC: 0.7022750497447231

	threshold: 0.6
	F1_SCORE: 0.14798798798798798
	PRECISION: 0.13339107838891295
	RECALL: 0.1661721068249258
	ROC_AUC: 0.702936986564914

	threshold: 0.7
	F1_SCORE: 0.14774513359932293
	PRECISION: 0.1338737949167397
	RECALL: 0.16482330725654168
	ROC_AUC: 0.7032008003257878

	threshold: 0.8
	F1_SCORE: 0.1488145896656535


In [26]:
# CatBoost weight in the final blend (XGBoost gets 1 - best_coeff_ct); set by hand.
best_coeff_ct = 0.7

Сохраняем submission

In [27]:
X_test_submit = pd.read_parquet("../../data/input_data/test_sber.parquet")

In [28]:
print(X_test_submit.shape)

(173433, 1078)


In [29]:
X_test_submit.head()

Unnamed: 0,id,sample_ml_new,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,...,feature1067,feature1068,feature1069,feature1070,feature1071,feature1072,feature1073,feature1074,feature1075,feature1076
0,3,3,1696,458,26,102479,22,16,0,121,...,779,7740,9577,254,0,355,308,779,7740,9577
1,4,3,1688,53,78,103922,191,64,0,0,...,79401,109240,153820,24766,48600,46029,65113,79401,109240,153820
2,12,3,1689,13,81,104111,191,4,0,0,...,0,0,0,0,0,0,0,0,0,0
3,16,3,1761,1759,44,102433,191,4,0,0,...,0,0,0,0,0,0,0,0,0,0
4,20,3,1761,1759,77,102010,191,34,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
filtered_columns = pd.read_csv("../../data/intermediate_data/cols_after_preprocessing.csv")

In [31]:
# Keep the columns that survived preprocessing and exist in the test frame.
# Iterating over the CSV order (de-duplicated with dict.fromkeys) instead of
# intersecting sets gives the same columns in a deterministic order on every run.
cols = [
    col
    for col in dict.fromkeys(filtered_columns.cols_after_preprocessing.values)
    if col in X_test_submit.columns
]

In [32]:
X_filtered = X_test_submit[cols]

In [33]:
# Rebuild the feature frame without `id` in a single non-inplace step. Dropping in
# place on a slice of X_test_submit triggers SettingWithCopyWarning, and the
# in-place version fails when the cell is re-run because `id` is already gone.
X_filtered = X_test_submit[cols].drop(columns=["id"])

In [34]:
print(X_filtered.shape)

(173433, 226)


In [35]:
X_filtered.head()

Unnamed: 0,feature549,feature781,feature488,feature103,feature433,feature506,feature551,feature451,feature367,feature776,...,feature987,feature760,feature362,feature481,feature556,feature290,feature177,feature547,feature373,feature480
0,17,79,58,213300,0,56,0,168,6,92,...,0,0,18,101,13,17213,193366,0,1,24
1,17,79,75,322921,5,40,1,13,5,92,...,0,12,18,101,13,13415,555802,0,0,24
2,17,79,142,565716,209,143,1,3,6,92,...,0,2,18,0,13,21063,596468,37,0,24
3,17,79,115,340738,209,190,1,168,3,92,...,0,0,18,2,13,339,416773,37,0,24
4,17,79,165,328274,209,277,1,8,2,92,...,0,0,18,101,13,32976,311784,37,0,24


In [36]:
# NOTE(review): the bounds used here are the test set's own 5% / 95% quantiles,
# not the ones computed on the training data, so train and test rows are clipped
# to different limits.
X_filtered_wo_outliers = preprocessing_outliers(X_filtered)

In [37]:
# Class-1 scores of the CatBoost pipeline for the submission rows.
# Note: this rebinds `pred_ct`, which previously held the hold-out predictions.
pred_ct = model_ct.predict_proba(X_filtered_wo_outliers)[:, 1]

In [38]:
# Class-1 scores of the XGBoost pipeline for the submission rows.
# Note: this rebinds `pred_xgb`, which previously held the hold-out predictions.
pred_xgb = model_xgb.predict_proba(X_filtered_wo_outliers)[:, 1]

In [39]:
# Weighted blend of the two models' scores, then binarize at the chosen threshold.
pred = (
    best_coeff_ct * pred_ct
    + (1 - best_coeff_ct) * pred_xgb
)
pred_binary = (pred >= best_threshold).astype(int)

In [40]:
submission = pd.read_csv("../../data/intermediate_data/sample_submission.csv")
submission.head(10)

Unnamed: 0,id,target_bin,target_prob
0,3,0,0.03
1,4,0,0.03
2,12,1,0.03
3,16,1,0.03
4,20,0,0.03
5,23,0,0.03
6,26,0,0.03
7,50,0,0.03
8,51,1,0.03
9,53,0,0.03


In [41]:
# Predictions are written by position, so the template rows must be in exactly
# the same order as the test rows they were computed from. Fail loudly otherwise.
assert submission["id"].tolist() == X_test_submit["id"].tolist(), \
    "sample_submission ids are not aligned with the test set rows"

submission["target_prob"] = pred
submission["target_bin"] = pred_binary
submission.head(10)

Unnamed: 0,id,target_bin,target_prob
0,3,0,0.039197
1,4,0,0.015897
2,12,0,0.017732
3,16,0,0.017288
4,20,0,0.061366
5,23,0,0.012605
6,26,0,0.016944
7,50,0,0.049112
8,51,0,0.048933
9,53,0,0.017004


In [42]:
submission.target_bin.value_counts()

target_bin
0    163067
1     10366
Name: count, dtype: int64

In [43]:
# Write the submission; the file name encodes the experiment version and the threshold used.
submission.to_csv(f"../../data/output_data/submission_version_{VERSION}_{best_threshold}.csv", index=False)

---