In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from sklearn.svm import *
from sklearn.model_selection import *
from sklearn.preprocessing import *
from sklearn.compose import *
from sklearn.pipeline import *
from sklearn.metrics import *
from sklearn.impute import *
from sklearn.neural_network import *
from sklearn.multioutput import *
from sklearn.ensemble import *

In [18]:
with open("../data/processed/train.pkl", "rb") as f:
    train = pickle.load(f)

In [19]:
with open("../data/processed/target.pkl", "rb") as f:
    target = pickle.load(f)

In [20]:
TARGET_COLS = ['Артериальная гипертензия', 'ОНМК', 'Стенокардия, ИБС, инфаркт миокарда', 'Сердечная недостаточность', 'Прочие заболевания сердца']
ID_COL = 'ID'
EDU_COL = 'Образование'
SEX_COL = 'Пол'
CAT_COLS = [
    'Пол', 'Семья', 'Этнос', 'Национальность', 'Религия', 'Образование', 
    'Профессия', 'Статус Курения', 'Алкоголь',
    'Время засыпания', 'Время пробуждения'
]
OHE_COLS = [
    'Вы работаете?', 'Выход на пенсию', 'Прекращение работы по болезни', 'Сахарный диабет', 'Гепатит',
    'Онкология', 'Хроническое заболевание легких', 'Бронжиальная астма', 'Туберкулез легких ', 'ВИЧ/СПИД',
    'Регулярный прим лекарственных средств', 'Травмы за год', 'Переломы','Пассивное курение', 'Сон после обеда', 
    'Спорт, клубы', 'Религия, клубы'
]
REAL_COLS = ['Возраст курения', 'Сигарет в день', 'Возраст алког', 'Частота пасс кур']

In [21]:
train[CAT_COLS] = train[CAT_COLS].astype('object')

In [22]:
train_data, val_data, train_target, val_target = train_test_split(train, target, train_size=0.8, random_state=42)

In [23]:
real_pipe = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler())
]
)

In [24]:
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='NA')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

In [25]:
import category_encoders as ce

In [26]:
preprocess_pipe = ColumnTransformer(transformers=[
    ('real_cols', real_pipe, REAL_COLS),
    ('cat_cols', cat_pipe, CAT_COLS),
    ('woe_cat_cols', ce.WOEEncoder(), CAT_COLS),
    ('ohe_cols', 'passthrough', OHE_COLS)
]
)

In [27]:
model_pipe = Pipeline([
    ('preprocess', preprocess_pipe),
    ('model', RandomForestClassifier())
]
)

In [28]:
multiout_model_pipe = MultiOutputClassifier(model_pipe, n_jobs=4)

In [29]:
fbeta_score_recall = make_scorer(fbeta_score, beta=2.0, average='micro')

In [30]:
params = {'estimator__model__n_estimators': np.arange(10, 100, 5)}

In [31]:
scores = RandomizedSearchCV(multiout_model_pipe, params, scoring=fbeta_score_recall, error_score='raise')

In [32]:
scores.fit(X=train_data, y=train_target)

In [33]:
scores.best_score_

0.40326039351144216

In [34]:
from catboost import CatBoostClassifier

In [35]:
cat_boost_model = CatBoostClassifier(loss_function='MultiLogloss',
                silent=True,
                random_seed=42,
                cat_features=CAT_COLS)

In [36]:
grid = {'learning_rate': [0.01, 0.05, 0.1],
        'depth': [4, 6, 10],
        'l2_leaf_reg': [1, 2, 3, 4, 5],
        'early_stopping_rounds': [50, 100, 150]}

In [40]:
cat_boost_model.randomized_search(grid, train_data, train_target, 4)

Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.3516897318
bestIteration = 107

0:	loss: 0.3516897	best: 0.3516897 (0)	total: 2.64s	remaining: 23.8s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.3514754725
bestIteration = 492

1:	loss: 0.3514755	best: 0.3514755 (1)	total: 11.3s	remaining: 45.1s

bestTest = 0.349261987
bestIteration = 999

2:	loss: 0.3492620	best: 0.3492620 (2)	total: 30.1s	remaining: 1m 10s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.3538703608
bestIteration = 318

3:	loss: 0.3538704	best: 0.3492620 (2)	total: 1m 12s	remaining: 1m 48s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.3521459721
bestIteration = 27

4:	loss: 0.3521460	best: 0.3492620 (2)	total: 1m 29s	remaining: 1m 29s
Stopped by overfitting detector  (150 iterations wait)

bestTest = 0.350885673
bestIteration = 60

5:	loss: 0.3508857	best: 0.3492620 (2)	total: 1m 58s	remaining: 1m 19s
Stopped by overfitting detector  (1

{'params': {'depth': 4,
  'od_wait': 100,
  'l2_leaf_reg': 5,
  'learning_rate': 0.01},
 'cv_results': defaultdict(list,
             {'iterations': [0,
               1,
               2,
               3,
               4,
               5,
               6,
               7,
               8,
               9,
               10,
               11,
               12,
               13,
               14,
               15,
               16,
               17,
               18,
               19,
               20,
               21,
               22,
               23,
               24,
               25,
               26,
               27,
               28,
               29,
               30,
               31,
               32,
               33,
               34,
               35,
               36,
               37,
               38,
               39,
               40,
               41,
               42,
               43,
               44,
               45,
 

In [43]:
cat_boost_model.predict(train_data)

array([[0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]])

In [46]:
recall_score(val_target, cat_boost_model.predict(val_data), average='micro')

0.4489795918367347