In [15]:
import pandas as pd
import numpy as np
import catboost as catb

import dask.dataframe as dd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from imblearn.under_sampling import RandomUnderSampler
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer

from sklearn.metrics import f1_score, classification_report
from sklearn.feature_selection import SelectFromModel

import pickle

In [2]:
from until import ColumnSelector, preprocess_data_train, preprocess_data_test, select_type_cols

In [3]:
TRAIN_DATA = 'data/data_train.csv'
FEATURES_DATA = 'data/features.csv'
RANDOM_STATE = 21

### Data preparation

In [4]:
train_df = pd.read_csv(TRAIN_DATA)

In [5]:
data_train, true_offers_ids = preprocess_data_train(train_df, FEATURES_DATA)

In [6]:
with open("data/offer_mark.txt", "w") as file:
    print(*true_offers_ids, file=file, sep="\n")

In [7]:
data_prelim = data_train.copy()

X_train = data_prelim.drop('target', axis=1)
y_train = data_prelim['target']

In [9]:
f_all, f_binary, f_categorical, f_numeric = select_type_cols(X_train)

In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(X_train, y_train, test_size=0.20, random_state=42, shuffle=False)

In [13]:
ros = RandomUnderSampler(random_state=42)
X_train, Y_train = ros.fit_resample(X_train, Y_train)

print("Train")
print(Y_train.value_counts()/y_train.shape[0])

Train
0.0    0.056824
1.0    0.056824
Name: target, dtype: float64


### Model Training

In [14]:
f_prep_pipeline = make_pipeline(
    ColumnSelector(columns=f_all),
    FeatureUnion(transformer_list=[
        ("numeric_features", make_pipeline(
            ColumnSelector(f_numeric),
            SimpleImputer(strategy="mean"),
            StandardScaler()
        )),
        ("categorical_features", make_pipeline(
            ColumnSelector(f_categorical),
            SimpleImputer(strategy="most_frequent"),
            OneHotEncoder(handle_unknown='ignore')
        )),
        ("boolean_features", make_pipeline(
            ColumnSelector(f_binary),
        ))
    ])
)

In [16]:
catb_model = catb.CatBoostClassifier(random_state=21,
                                    score_function='L2',
                                    depth = 7,
                                    eval_metric='AUC',
                                    l2_leaf_reg = 4,
                                    learning_rate=0.01,
                                    custom_metric=['Precision', 'Recall'],
                                    iterations=1000,                                    
                                )

In [17]:
cat_boost_pipe = make_pipeline(
    f_prep_pipeline,
    catb_model
)

In [18]:
cat_boost_pipe.fit(X_train, Y_train)

0:	total: 248ms	remaining: 4m 7s
1:	total: 316ms	remaining: 2m 37s
2:	total: 384ms	remaining: 2m 7s
3:	total: 454ms	remaining: 1m 53s
4:	total: 524ms	remaining: 1m 44s
5:	total: 592ms	remaining: 1m 38s
6:	total: 665ms	remaining: 1m 34s
7:	total: 736ms	remaining: 1m 31s
8:	total: 804ms	remaining: 1m 28s
9:	total: 874ms	remaining: 1m 26s
10:	total: 944ms	remaining: 1m 24s
11:	total: 1.01s	remaining: 1m 23s
12:	total: 1.09s	remaining: 1m 22s
13:	total: 1.16s	remaining: 1m 21s
14:	total: 1.23s	remaining: 1m 20s
15:	total: 1.3s	remaining: 1m 19s
16:	total: 1.37s	remaining: 1m 19s
17:	total: 1.44s	remaining: 1m 18s
18:	total: 1.51s	remaining: 1m 17s
19:	total: 1.58s	remaining: 1m 17s
20:	total: 1.65s	remaining: 1m 16s
21:	total: 1.72s	remaining: 1m 16s
22:	total: 1.78s	remaining: 1m 15s
23:	total: 1.85s	remaining: 1m 15s
24:	total: 1.92s	remaining: 1m 15s
25:	total: 1.99s	remaining: 1m 14s
26:	total: 2.07s	remaining: 1m 14s
27:	total: 2.14s	remaining: 1m 14s
28:	total: 2.21s	remaining: 1m 13

238:	total: 16.6s	remaining: 53s
239:	total: 16.7s	remaining: 52.9s
240:	total: 16.8s	remaining: 52.8s
241:	total: 16.8s	remaining: 52.7s
242:	total: 16.9s	remaining: 52.6s
243:	total: 16.9s	remaining: 52.5s
244:	total: 17s	remaining: 52.4s
245:	total: 17.1s	remaining: 52.4s
246:	total: 17.2s	remaining: 52.3s
247:	total: 17.2s	remaining: 52.2s
248:	total: 17.3s	remaining: 52.2s
249:	total: 17.4s	remaining: 52.1s
250:	total: 17.4s	remaining: 52s
251:	total: 17.5s	remaining: 51.9s
252:	total: 17.6s	remaining: 51.8s
253:	total: 17.6s	remaining: 51.8s
254:	total: 17.7s	remaining: 51.7s
255:	total: 17.8s	remaining: 51.6s
256:	total: 17.8s	remaining: 51.5s
257:	total: 17.9s	remaining: 51.5s
258:	total: 18s	remaining: 51.4s
259:	total: 18s	remaining: 51.3s
260:	total: 18.1s	remaining: 51.2s
261:	total: 18.1s	remaining: 51.1s
262:	total: 18.2s	remaining: 51s
263:	total: 18.3s	remaining: 50.9s
264:	total: 18.3s	remaining: 50.8s
265:	total: 18.4s	remaining: 50.8s
266:	total: 18.5s	remaining: 50.

475:	total: 31.8s	remaining: 35s
476:	total: 31.9s	remaining: 35s
477:	total: 31.9s	remaining: 34.9s
478:	total: 32s	remaining: 34.8s
479:	total: 32.1s	remaining: 34.7s
480:	total: 32.1s	remaining: 34.6s
481:	total: 32.2s	remaining: 34.6s
482:	total: 32.2s	remaining: 34.5s
483:	total: 32.3s	remaining: 34.4s
484:	total: 32.3s	remaining: 34.3s
485:	total: 32.4s	remaining: 34.3s
486:	total: 32.5s	remaining: 34.2s
487:	total: 32.5s	remaining: 34.1s
488:	total: 32.6s	remaining: 34.1s
489:	total: 32.6s	remaining: 34s
490:	total: 32.7s	remaining: 33.9s
491:	total: 32.8s	remaining: 33.8s
492:	total: 32.8s	remaining: 33.7s
493:	total: 32.9s	remaining: 33.7s
494:	total: 32.9s	remaining: 33.6s
495:	total: 33s	remaining: 33.5s
496:	total: 33s	remaining: 33.4s
497:	total: 33.1s	remaining: 33.4s
498:	total: 33.2s	remaining: 33.3s
499:	total: 33.2s	remaining: 33.2s
500:	total: 33.3s	remaining: 33.2s
501:	total: 33.4s	remaining: 33.1s
502:	total: 33.4s	remaining: 33s
503:	total: 33.5s	remaining: 33s
5

713:	total: 46.2s	remaining: 18.5s
714:	total: 46.3s	remaining: 18.4s
715:	total: 46.3s	remaining: 18.4s
716:	total: 46.4s	remaining: 18.3s
717:	total: 46.5s	remaining: 18.2s
718:	total: 46.5s	remaining: 18.2s
719:	total: 46.6s	remaining: 18.1s
720:	total: 46.6s	remaining: 18s
721:	total: 46.7s	remaining: 18s
722:	total: 46.7s	remaining: 17.9s
723:	total: 46.8s	remaining: 17.8s
724:	total: 46.8s	remaining: 17.8s
725:	total: 46.9s	remaining: 17.7s
726:	total: 47s	remaining: 17.6s
727:	total: 47s	remaining: 17.6s
728:	total: 47.1s	remaining: 17.5s
729:	total: 47.1s	remaining: 17.4s
730:	total: 47.2s	remaining: 17.4s
731:	total: 47.3s	remaining: 17.3s
732:	total: 47.3s	remaining: 17.2s
733:	total: 47.4s	remaining: 17.2s
734:	total: 47.4s	remaining: 17.1s
735:	total: 47.5s	remaining: 17s
736:	total: 47.6s	remaining: 17s
737:	total: 47.6s	remaining: 16.9s
738:	total: 47.7s	remaining: 16.8s
739:	total: 47.8s	remaining: 16.8s
740:	total: 47.8s	remaining: 16.7s
741:	total: 47.9s	remaining: 16.

954:	total: 1m	remaining: 2.86s
955:	total: 1m	remaining: 2.8s
956:	total: 1m	remaining: 2.73s
957:	total: 1m	remaining: 2.67s
958:	total: 1m 1s	remaining: 2.61s
959:	total: 1m 1s	remaining: 2.54s
960:	total: 1m 1s	remaining: 2.48s
961:	total: 1m 1s	remaining: 2.42s
962:	total: 1m 1s	remaining: 2.35s
963:	total: 1m 1s	remaining: 2.29s
964:	total: 1m 1s	remaining: 2.23s
965:	total: 1m 1s	remaining: 2.16s
966:	total: 1m 1s	remaining: 2.1s
967:	total: 1m 1s	remaining: 2.03s
968:	total: 1m 1s	remaining: 1.97s
969:	total: 1m 1s	remaining: 1.91s
970:	total: 1m 1s	remaining: 1.84s
971:	total: 1m 1s	remaining: 1.78s
972:	total: 1m 1s	remaining: 1.72s
973:	total: 1m 1s	remaining: 1.65s
974:	total: 1m 1s	remaining: 1.59s
975:	total: 1m 2s	remaining: 1.52s
976:	total: 1m 2s	remaining: 1.46s
977:	total: 1m 2s	remaining: 1.4s
978:	total: 1m 2s	remaining: 1.33s
979:	total: 1m 2s	remaining: 1.27s
980:	total: 1m 2s	remaining: 1.21s
981:	total: 1m 2s	remaining: 1.14s
982:	total: 1m 2s	remaining: 1.08s


In [19]:
preds_train = cat_boost_pipe.predict(X_train)
f1_score(Y_train, preds_train, average='macro')

0.8861118699005465

In [20]:
preds_test = cat_boost_pipe.predict(X_test)
f1_score(Y_test, preds_test, average='macro')

0.7312543163352377

In [21]:
print(classification_report(Y_test, preds_test))

              precision    recall  f1-score   support

         0.0       1.00      0.86      0.92    153403
         1.0       0.37      1.00      0.54     12928

    accuracy                           0.87    166331
   macro avg       0.68      0.93      0.73    166331
weighted avg       0.95      0.87      0.89    166331



In [22]:
with open('data/cat_boost_model.pickle', 'wb') as f:
    pickle.dump(cat_boost_pipe, f, protocol=pickle.HIGHEST_PROTOCOL)