**Шестой подход (запускаем бейзлайн на отфильтрованном датасете + выбросы):**

- используем отфильтрованный трейн
- обрезаем выбросы: значения ниже 5-го и выше 95-го перцентиля заменяем на сами перцентили (строки не удаляются)
- фильтруем тест через csv со списком признаков после фильтрации
- запускаем predict_proba
- считаем метрики
- закидываем submission_version_6

Импортируем библиотеки

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
import warnings

warnings.simplefilter("ignore")

In [2]:
# Experiment number; it is embedded in the name of the submission file written at the end.
VERSION = 6

Считываем отфильтрованные данные

In [3]:
X = pd.read_parquet("../../data/intermediate_data/filter_train.parquet")

In [4]:
print(X.shape)

(519615, 228)


In [5]:
X.head()

Unnamed: 0,feature547,feature418,feature867,feature15,feature641,feature734,feature549,feature674,feature442,feature763,...,feature717,feature844,feature380,feature550,feature864,feature727,feature454,feature369,feature367,feature409
0,37,77,7063,6,0,14,17,1,35,0,...,4,1283,0,49,1619,28,0,1,4,56
1,1,77,7063,135,0,14,17,0,35,153,...,4,1283,0,49,2092,28,6,1,4,56
2,37,77,7063,0,0,14,17,0,35,153,...,4,1283,0,49,7174,28,83,1,3,56
3,37,77,7063,0,0,14,17,0,35,1,...,4,1283,0,49,7174,28,0,1,2,56
4,37,77,7063,0,0,14,17,0,35,153,...,4,1283,0,49,1439,28,1,2,2,56


In [6]:
X.target.value_counts()

target
0    501078
1     18537
Name: count, dtype: int64

In [7]:
# Detach the label column from the feature frame, then discard the row
# identifier so that only model features remain in X.
y = X.pop("target")
X.drop(columns=["id"], inplace=True)

In [8]:
y.tail(3)

519612    0
519613    0
519614    1
Name: target, dtype: int64

In [10]:
def preprocessing_outliers(data: pd.DataFrame, g_m_int: float = 0.05, q_p_int: float = 0.95) -> pd.DataFrame:
    """Clip every column of *data* to its own quantile range.

    Values below the ``g_m_int`` quantile of a column are replaced by that
    quantile, and values above the ``q_p_int`` quantile by the upper one.
    No rows are removed. The bounds are computed from *data* itself,
    independently for each column.

    Args:
        data: Frame whose columns support ``Series.quantile`` (i.e. numeric
            columns).
        g_m_int: Lower quantile level, between 0 and 1.
        q_p_int: Upper quantile level, between 0 and 1; must not be smaller
            than ``g_m_int``.

    Returns:
        A new DataFrame with the same index and columns. The input frame is
        not modified.

    Raises:
        ValueError: If ``g_m_int`` is greater than ``q_p_int``.
    """
    if g_m_int > q_p_int:
        raise ValueError(
            f"lower quantile level {g_m_int} must not exceed upper level {q_p_int}"
        )

    # Work on a copy: callers pass frames they still use afterwards (and
    # sometimes slices of other frames), so mutating the argument is unsafe.
    clipped = data.copy()

    for feature in clipped.columns:
        column = clipped[feature]
        clipped[feature] = column.clip(
            lower=column.quantile(g_m_int),
            upper=column.quantile(q_p_int),
        )

    return clipped

In [11]:
X_wo_outliers = preprocessing_outliers(X)

Разделяем данные на обучающую и тестовую выборки

In [14]:
# Stratified 80/20 hold-out split; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_wo_outliers,
    y,
    test_size=0.2,
    stratify=y,
    random_state=23,
)

In [15]:
# Baseline model: CatBoost with default hyperparameters, fitted on the
# training part of the split with the per-iteration log suppressed.
model = CatBoostClassifier()
model.fit(X_train, y_train, verbose=False)

<catboost.core.CatBoostClassifier at 0x7f1e03f159d0>

In [16]:
# NOTE(review): this fits the same model on the same data as the previous cell,
# this time with the training log printed — looks redundant; confirm and keep only one fit.
model.fit(X_train, y_train)

Learning rate set to 0.135254
0:	learn: 0.5058446	total: 25ms	remaining: 24.9s
1:	learn: 0.3860458	total: 48.8ms	remaining: 24.4s
2:	learn: 0.3080581	total: 72.9ms	remaining: 24.2s
3:	learn: 0.2572182	total: 96.9ms	remaining: 24.1s
4:	learn: 0.2230390	total: 120ms	remaining: 24s
5:	learn: 0.2004942	total: 144ms	remaining: 23.8s
6:	learn: 0.1846818	total: 167ms	remaining: 23.7s
7:	learn: 0.1736965	total: 190ms	remaining: 23.6s
8:	learn: 0.1663641	total: 213ms	remaining: 23.5s
9:	learn: 0.1607630	total: 237ms	remaining: 23.5s
10:	learn: 0.1568315	total: 261ms	remaining: 23.4s
11:	learn: 0.1536293	total: 284ms	remaining: 23.4s
12:	learn: 0.1514401	total: 306ms	remaining: 23.2s
13:	learn: 0.1500731	total: 329ms	remaining: 23.2s
14:	learn: 0.1487197	total: 353ms	remaining: 23.2s
15:	learn: 0.1477915	total: 375ms	remaining: 23.1s
16:	learn: 0.1470851	total: 398ms	remaining: 23s
17:	learn: 0.1464559	total: 426ms	remaining: 23.2s
18:	learn: 0.1459569	total: 449ms	remaining: 23.2s
19:	learn: 0.

<catboost.core.CatBoostClassifier at 0x7f1e03f159d0>

In [17]:
# Keep only the second column of predict_proba: the probability of class 1.
pred = model.predict_proba(X_test)[:, 1]

In [18]:
# ROC AUC is computed from the raw probabilities, so it does not depend on the
# threshold: evaluate it once instead of on every loop iteration.
roc_auc = roc_auc_score(y_test, pred)

for threshold in [0.1, 0.3, 0.5]:
    # Turn probabilities into hard labels at the current cut-off.
    pred_binary = (pred >= threshold)

    print("\tthreshold:", threshold)
    print("\tF1_SCORE:", f1_score(y_test, pred_binary))
    print("\tPRECISION:", precision_score(y_test, pred_binary))
    print("\tRECALL:", recall_score(y_test, pred_binary))
    print("\tROC_AUC:", roc_auc)
    print()

	threshold: 0.1
	F1_SCORE: 0.1492436974789916
	PRECISION: 0.12763510923725566
	RECALL: 0.1796601025087672
	ROC_AUC: 0.6940128448529058

	threshold: 0.3
	F1_SCORE: 0.027425088877602845
	PRECISION: 0.23376623376623376
	RECALL: 0.014567035338548692
	ROC_AUC: 0.6940128448529058

	threshold: 0.5
	F1_SCORE: 0.0037604082728982004
	PRECISION: 0.4375
	RECALL: 0.0018883193957377933
	ROC_AUC: 0.6940128448529058



In [19]:
# Decision threshold used for the binary submission column; set by hand to the
# value with the highest F1 among the thresholds evaluated above.
best_threshold = 0.1

Сохраняем submission

In [20]:
X_test_submit = pd.read_parquet("../../data/input_data/test_sber.parquet")

In [21]:
print(X_test_submit.shape)

(173433, 1078)


In [22]:
X_test_submit.head()

Unnamed: 0,id,sample_ml_new,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,...,feature1067,feature1068,feature1069,feature1070,feature1071,feature1072,feature1073,feature1074,feature1075,feature1076
0,3,3,1696,458,26,102479,22,16,0,121,...,779,7740,9577,254,0,355,308,779,7740,9577
1,4,3,1688,53,78,103922,191,64,0,0,...,79401,109240,153820,24766,48600,46029,65113,79401,109240,153820
2,12,3,1689,13,81,104111,191,4,0,0,...,0,0,0,0,0,0,0,0,0,0
3,16,3,1761,1759,44,102433,191,4,0,0,...,0,0,0,0,0,0,0,0,0,0
4,20,3,1761,1759,77,102010,191,34,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
filtered_columns = pd.read_csv("../../data/intermediate_data/cols_after_preprocessing.csv")

In [24]:
# Select the features that survived preprocessing and exist in the test frame.
# The order of the CSV list is kept (duplicates dropped): a set intersection
# yields an arbitrary, hash-seed-dependent order, which made the feature order
# non-reproducible between runs.
test_columns = set(X_test_submit.columns)
cols = [
    col
    for col in dict.fromkeys(filtered_columns.cols_after_preprocessing)
    if col in test_columns
]

In [25]:
X_filtered = X_test_submit[cols]

In [26]:
# Rebind to the frame returned by drop() instead of mutating in place:
# X_filtered was obtained by column selection from X_test_submit, and in-place
# changes on such a derived frame trigger pandas' SettingWithCopyWarning.
X_filtered = X_filtered.drop(columns=["id"])

In [27]:
print(X_filtered.shape)

(173433, 226)


In [28]:
X_filtered.head()

Unnamed: 0,feature500,feature458,feature752,feature423,feature681,feature641,feature502,feature776,feature488,feature416,...,feature457,feature17,feature337,feature440,feature429,feature766,feature514,feature764,feature782,feature495
0,147,119,17,17,1,0,1,92,58,34,...,129,1,1332,217,2,14,87,57,42,0
1,1,2,17,17,0,0,103,92,75,34,...,129,1,1332,0,1,14,1,57,42,0
2,0,119,17,17,0,0,1,92,142,34,...,129,0,1332,2,2,14,87,0,0,3
3,0,119,17,17,0,0,103,92,115,34,...,129,135,1332,217,2,14,87,57,42,1
4,147,119,17,17,0,0,2,92,165,34,...,129,72,1332,217,1,14,87,57,42,1


In [29]:
# NOTE(review): preprocessing_outliers derives its clipping bounds from the frame it
# receives, so the submission set is clipped to its own quantiles rather than to the
# bounds used for the training data — confirm this is intended.
X_filtered_wo_outliers = preprocessing_outliers(X_filtered)

In [30]:
# Class-1 probability for every submission row, followed by the hard 0/1
# label obtained by cutting at best_threshold.
pred = model.predict_proba(X_filtered_wo_outliers)[:, 1]
pred_binary = (pred >= best_threshold).astype(int)

In [31]:
submission = pd.read_csv("../../data/intermediate_data/sample_submission.csv")
submission.head(10)

Unnamed: 0,id,target_bin,target_prob
0,3,0,0.03
1,4,0,0.03
2,12,1,0.03
3,16,1,0.03
4,20,0,0.03
5,23,0,0.03
6,26,0,0.03
7,50,0,0.03
8,51,1,0.03
9,53,0,0.03


In [32]:
# Overwrite the placeholder columns of the sample submission with the model output.
# NOTE(review): the arrays are assigned by position — this assumes the rows of
# sample_submission.csv are in the same order as test_sber.parquet; verify the ids match.
submission["target_prob"] = pred
submission["target_bin"] = pred_binary
submission.head(10)

Unnamed: 0,id,target_bin,target_prob
0,3,0,0.05248
1,4,0,0.013531
2,12,0,0.010884
3,16,0,0.018454
4,20,0,0.065603
5,23,0,0.009841
6,26,0,0.022896
7,50,0,0.049525
8,51,0,0.043843
9,53,0,0.017601


In [33]:
submission.target_bin.value_counts()

target_bin
0    162403
1     11030
Name: count, dtype: int64

In [34]:
# The file name encodes both the experiment version and the threshold used for target_bin.
submission.to_csv(f"../../data/output_data/submission_version_{VERSION}_{best_threshold}.csv", index=False)

---