**Четвертый подход (убираем гридсерч):**

- используем отфильтрованный трейн
- работа с выбросами
- делаем oversampling при помощи ADASYN
- фильтруем тест через csv со списком признаков после фильтрации 
- запускаем predict_proba
- считаем метрики
- закидываем submission_version_4

Импортируем библиотеки

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import ADASYN
import warnings

warnings.simplefilter("ignore")

In [6]:
VERSION = 4

Считываем отфильтрованные данные

In [7]:
X = pd.read_parquet("../../data/intermediate_data/filter_train.parquet")

In [8]:
print(X.shape)

(519615, 228)


In [9]:
X.head()

Unnamed: 0,feature547,feature418,feature867,feature15,feature641,feature734,feature549,feature674,feature442,feature763,...,feature717,feature844,feature380,feature550,feature864,feature727,feature454,feature369,feature367,feature409
0,37,77,7063,6,0,14,17,1,35,0,...,4,1283,0,49,1619,28,0,1,4,56
1,1,77,7063,135,0,14,17,0,35,153,...,4,1283,0,49,2092,28,6,1,4,56
2,37,77,7063,0,0,14,17,0,35,153,...,4,1283,0,49,7174,28,83,1,3,56
3,37,77,7063,0,0,14,17,0,35,1,...,4,1283,0,49,7174,28,0,1,2,56
4,37,77,7063,0,0,14,17,0,35,153,...,4,1283,0,49,1439,28,1,2,2,56


In [10]:
X.target.value_counts()

target
0    501078
1     18537
Name: count, dtype: int64

In [11]:
y = X["target"]
X.drop(columns = ["target", "id"], inplace = True)

In [12]:
y.tail(3)

519612    0
519613    0
519614    1
Name: target, dtype: int64

In [14]:
def preprocessing_outliers(data: pd.DataFrame, g_m_int: int = 0.05, q_p_int: int = 0.95) -> pd.DataFrame:
    features = data.columns
    
    for feature in features:
        q_m = data[feature].quantile(g_m_int)
        q_p = data[feature].quantile(q_p_int)
        
        data.loc[data[feature] < q_m, feature] = q_m
        data.loc[data[feature] > q_p, feature] = q_p
        
    return data

In [15]:
X_wo_outliers = preprocessing_outliers(X)

Делаем oversampling через ADASYN

In [16]:
X_resampled, y_resampled = ADASYN().fit_resample(X_wo_outliers, y)

In [17]:
X_resampled.shape

(994992, 226)

Разделяем данные на обучающую и тестовую выборки

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled,
                                                    y_resampled, 
                                                    test_size=0.2, 
                                                    stratify=y_resampled, 
                                                    random_state=23)

In [20]:
model = CatBoostClassifier()
model.fit(X_train, y_train, verbose=False)

<catboost.core.CatBoostClassifier at 0x7f200070e190>

In [21]:
model.fit(X_train, y_train)

Learning rate set to 0.178493
0:	learn: 0.5863886	total: 42.4ms	remaining: 42.4s
1:	learn: 0.5020691	total: 80.8ms	remaining: 40.3s
2:	learn: 0.4251160	total: 121ms	remaining: 40.4s
3:	learn: 0.3596846	total: 161ms	remaining: 40s
4:	learn: 0.3231730	total: 199ms	remaining: 39.7s
5:	learn: 0.2924695	total: 237ms	remaining: 39.3s
6:	learn: 0.2741622	total: 279ms	remaining: 39.6s
7:	learn: 0.2472708	total: 318ms	remaining: 39.4s
8:	learn: 0.2239565	total: 358ms	remaining: 39.4s
9:	learn: 0.2106070	total: 395ms	remaining: 39.1s
10:	learn: 0.1966388	total: 430ms	remaining: 38.7s
11:	learn: 0.1848087	total: 464ms	remaining: 38.2s
12:	learn: 0.1752156	total: 502ms	remaining: 38.1s
13:	learn: 0.1586408	total: 539ms	remaining: 38s
14:	learn: 0.1521506	total: 576ms	remaining: 37.8s
15:	learn: 0.1479008	total: 611ms	remaining: 37.6s
16:	learn: 0.1422005	total: 649ms	remaining: 37.5s
17:	learn: 0.1364721	total: 686ms	remaining: 37.4s
18:	learn: 0.1324826	total: 723ms	remaining: 37.3s
19:	learn: 0.

<catboost.core.CatBoostClassifier at 0x7f200070e190>

In [22]:
pred = model.predict_proba(X_test)
pred = pred[:, 1]

In [23]:
for threshold in [0.1, 0.3, 0.5]:
    pred_binary = (pred >= threshold)
    
    print("\tthreshold:", threshold)
    print("\tF1_SCORE:", f1_score(y_test, pred_binary))
    print("\tPRECISION:", precision_score(y_test, pred_binary))
    print("\tRECALL:", recall_score(y_test, pred_binary))
    print("\tROC_AUC:", roc_auc_score(y_test, pred))
    print()

	threshold: 0.1
	F1_SCORE: 0.9541887877717351
	PRECISION: 0.9382644773665773
	RECALL: 0.9706629683245093
	ROC_AUC: 0.987993353951953

	threshold: 0.3
	F1_SCORE: 0.9792054656391156
	PRECISION: 0.9955332391861499
	RECALL: 0.9634046344006559
	ROC_AUC: 0.987993353951953

	threshold: 0.5
	F1_SCORE: 0.980369098092615
	PRECISION: 0.9993795678006204
	RECALL: 0.9620683720883147
	ROC_AUC: 0.987993353951953



In [24]:
best_threshold = 0.1

Сохраняем submission

In [25]:
X_test_submit = pd.read_parquet("../../data/input_data/test_sber.parquet")
#X.drop(columns = ["sample_ml_new", "id"], inplace = True)

In [26]:
print(X_test_submit.shape)

(173433, 1078)


In [27]:
X_test_submit.head()

Unnamed: 0,id,sample_ml_new,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,...,feature1067,feature1068,feature1069,feature1070,feature1071,feature1072,feature1073,feature1074,feature1075,feature1076
0,3,3,1696,458,26,102479,22,16,0,121,...,779,7740,9577,254,0,355,308,779,7740,9577
1,4,3,1688,53,78,103922,191,64,0,0,...,79401,109240,153820,24766,48600,46029,65113,79401,109240,153820
2,12,3,1689,13,81,104111,191,4,0,0,...,0,0,0,0,0,0,0,0,0,0
3,16,3,1761,1759,44,102433,191,4,0,0,...,0,0,0,0,0,0,0,0,0,0
4,20,3,1761,1759,77,102010,191,34,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
filtered_columns = pd.read_csv("../../data/intermediate_data/cols_after_preprocessing.csv")

In [32]:
cols = list(set(filtered_columns.cols_after_preprocessing.values) & set(X_test_submit.columns))

In [33]:
X_filtered = X_test_submit[cols]

In [35]:
X_filtered.drop(columns=["id"], inplace=True)

In [36]:
print(X_filtered.shape)

(173433, 226)


In [37]:
X_filtered.head()

Unnamed: 0,feature740,feature63,feature844,feature873,feature767,feature22,feature547,feature800,feature653,feature584,...,feature766,feature399,feature530,feature458,feature505,feature483,feature813,feature771,feature17,feature529
0,11,75855,1283,44,55,119624,0,0,0,0,...,14,72,82,119,8,16,31,20,1,0
1,11,159808,1283,33,55,140184,0,3,1,0,...,14,72,161,2,37,16,31,20,1,7
2,11,159808,139,26,0,140184,37,1,1,0,...,14,72,246,119,11,16,13,20,0,3
3,11,159808,1283,4121,55,140184,37,1,0,0,...,14,72,274,119,1,16,31,20,135,360
4,11,159808,1283,2199,55,140184,37,1,0,0,...,14,72,363,119,24,16,31,20,72,0


In [38]:
X_filtered_wo_outliers = preprocessing_outliers(X_filtered)

In [61]:
pred = model.predict_proba(X_filtered_wo_outliers)

pred = pred[:, 1]
pred_binary = (pred >= best_threshold).astype(int)

In [62]:
submission = pd.read_csv("../../data/intermediate_data/sample_submission.csv")
submission.head(10)

Unnamed: 0,id,target_bin,target_prob
0,3,0,0.03
1,4,0,0.03
2,12,1,0.03
3,16,1,0.03
4,20,0,0.03
5,23,0,0.03
6,26,0,0.03
7,50,0,0.03
8,51,1,0.03
9,53,0,0.03


In [63]:
submission["target_prob"] = pred
submission["target_bin"] = pred_binary
submission.head(10)

Unnamed: 0,id,target_bin,target_prob
0,3,0,0.049584
1,4,0,0.016115
2,12,0,0.009852
3,16,0,0.014637
4,20,0,0.074461
5,23,0,0.010707
6,26,0,0.025657
7,50,0,0.092706
8,51,0,0.04532
9,53,0,0.018948


In [64]:
submission.target_bin.value_counts()

target_bin
0    154556
1     18877
Name: count, dtype: int64

In [65]:
submission.to_csv(f"../../data/output_data/submission_version_{VERSION}.csv", index=False)

---