# Pipeline samples

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
sns.set_style("whitegrid")
from catboost import CatBoostClassifier
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import make_scorer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression

## Sample 1

In [None]:
# Model-selection scorer built on `mape`. greater_is_better=False marks it as a
# loss, so scikit-learn sign-flips the value and a lower error ranks higher.
# NOTE(review): `mape` is neither defined nor imported in the visible cells —
# confirm which cell/module provides it.
scorer = make_scorer(score_func=mape, greater_is_better=False)

# Integer-encode `building_type`; the mapping is learned from the training
# split only and then applied to both splits (the frames are modified in place).
# NOTE(review): `transform` raises ValueError for a label that did not occur in
# x_train — confirm x_test contains no such values.
lb = LabelEncoder()
lb.fit(x_train['building_type'].values)

x_train['building_type'] = lb.transform(x_train['building_type'].values)
x_test['building_type'] = lb.transform(x_test['building_type'].values)


def _build_pipeline(estimator):
    """Return an unfitted preprocessing + model pipeline wrapped around ``estimator``.

    Columns listed in the notebook-level ``real_features`` go through
    ``StandardScaler``; columns in ``cat_features`` are one-hot encoded with
    ``handle_unknown='ignore'``. The estimator is registered under the step
    name ``'model'``, so grid keys must use the ``model__`` prefix.
    """
    column_transformer = ColumnTransformer(
        transformers=[
            ('real', StandardScaler(), real_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features),
        ],
        n_jobs=4,
    )
    return Pipeline(
        steps=[
            ('column_transformer', column_transformer),
            ('model', estimator),
        ]
    )


def hyperopt(estimator, params, cv=3, verbose=2):
    """Grid-search ``estimator`` inside the preprocessing pipeline and score the winner.

    Reads the notebook-level ``x_train``, ``y_train``, ``x_test``, ``y_test``,
    ``real_features``, ``cat_features``, ``scorer`` and ``mape``.

    Parameters
    ----------
    estimator : scikit-learn estimator
        Model placed in the pipeline's ``'model'`` step. GridSearchCV works on
        clones, so the instance passed in is left unfitted.
    params : dict
        Parameter grid for ``GridSearchCV`` (keys such as ``'model__max_depth'``).
    cv : int or CV splitter, default 3
        Cross-validation strategy forwarded to ``GridSearchCV``.
    verbose : int, default 2
        Verbosity forwarded to ``GridSearchCV``.

    Returns
    -------
    tuple
        ``(score_train, score_test, best_params)`` where the scores are
        ``mape`` on the train and test splits and ``best_params`` is
        ``GridSearchCV.best_params_``.
    """
    grid = GridSearchCV(
        estimator=_build_pipeline(estimator),
        param_grid=params,
        scoring=scorer,
        cv=cv,
        verbose=verbose,
    )
    grid.fit(x_train, y_train)

    best_params = grid.best_params_

    # refit=True (the GridSearchCV default) has already retrained the pipeline
    # with `best_params` on the whole training set, so it is reused here
    # instead of building and fitting an identical second pipeline by hand.
    best_pipeline = grid.best_estimator_

    score_train = mape(y_train, best_pipeline.predict(x_train))
    score_test = mape(y_test, best_pipeline.predict(x_test))

    return score_train, score_test, best_params

In [None]:
# Search space for the gradient-boosting regressor. Every key carries the
# `model__` prefix because the estimator is the pipeline step named 'model'.
param_grid = {
    'model__n_estimators': [10],
    'model__learning_rate': [0.01, 0.1, 0.3, 0.5],
    'model__min_samples_split': [2, 12, 500],
    'model__max_depth': [5, 9, 12],
    'model__subsample': [0.5, 0.8, 1],
}

# Last expression of the cell: displays (score_train, score_test, best_params).
hyperopt(estimator=GradientBoostingRegressor(), params=param_grid)

## Sample 2

In [None]:
# Data transformers.
# Numeric branch: scaling only. An imputer such as
# ('imputer', SimpleImputer(strategy='median')) could be added in front of the
# scaler to fill missing values.
numeric_transformer = Pipeline([
    ('scaler', StandardScaler()),
])

# Categorical branch: one-hot encoding with handle_unknown='ignore'.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Preprocessing: route numeric and categorical columns to their transformers.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, num_cols),
    ('cat', categorical_transformer, cat_cols),
])

# Main pipeline: preprocessing followed by logistic regression.
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000)),
])

# Render the pipeline as a diagram.
set_config(display='diagram')
pipe

In [None]:
# Search space covering both the model and the preprocessing:
#   - which scaler sits in the numeric branch's 'scaler' step,
#   - the `C` parameter of the logistic regression.
parameters = {
    'preprocessor__num__scaler': [
        StandardScaler(),
        RobustScaler(),
    ],
    'classifier__C': [
        100, 10, 1,
        0.1, 0.01, 0.001,
    ],
}

In [None]:
# Hyper-parameter search.
# Features are the numeric columns followed by the categorical ones; the
# target is the `target_col` column of `data`.
X_train = data[num_cols + cat_cols]
y_train = data[target_col]

# Five stratified random splits, 30% of the rows held out in each, fixed seed.
cross_val = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=42)

# Exhaustive search over `parameters`, ranked by ROC AUC.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=parameters,
    scoring='roc_auc',
    cv=cross_val,
).fit(X_train, y_train)

In [None]:
# Report the winning parameter combination of the GridSearchCV run and its
# cross-validated score (the search was configured with scoring='roc_auc').
print("Лучшие параметры:", grid.best_params_)
print("Лучший score:", grid.best_score_)

In [None]:
# Model
model = CatBoostClassifier(iterations=100,  # tree count lowered to 100 to speed up the search
                           cat_features=cat_cols,  # categorical features
                           eval_metric='AUC:hints=skip_train~false',  # metric; the hint asks for AUC on the training data too
                           verbose=False)  # no training log output

# Parameter grid for the search.
# Deliberately NOT named `grid`: that name already holds the fitted
# GridSearchCV object of the logistic-regression search, and re-binding it to
# a dict would break re-running the cell that prints `grid.best_params_`.
catboost_param_grid = {'learning_rate': [0.01, 0.03, 0.05, 0.1],
                       'depth': [4, 6, 10],
                       'l2_leaf_reg': [1, 3, 5, 7, 9]}

# Search for the best parameters (5-fold, shuffled, stratified CV).
# refit=True asks CatBoost to retrain `model` with the best parameters found.
grid_search_result = model.grid_search(catboost_param_grid,
                                       X=X_train,
                                       y=y_train,
                                       cv=5,
                                       refit=True,
                                       shuffle=True,
                                       stratified=True,
                                       verbose=False,
                                       plot=True)

In [None]:
# CatBoost rebuilt with the best parameters found by the grid search; the
# fixed settings come first, the tuned ones are unpacked from the result.
# NOTE(review): AUC is passed here as `custom_metric`, while the searched model
# received it as `eval_metric` — confirm the difference is intended.
best_boost = CatBoostClassifier(
    iterations=100,
    cat_features=cat_cols,
    custom_metric='AUC:hints=skip_train~false',
    verbose=False,
    **grid_search_result['params'],
)

best_boost.fit(X_train, y_train)

In [None]:
# Cross-validation results returned by the CatBoost grid search, shown as a table.
cv_results_df = pd.DataFrame(grid_search_result['cv_results'])
cv_results_df

In [None]:
# Best parameter set found by the CatBoost grid search.
print("Лучшие параметры:", grid_search_result['params'])

# Derive the best score from the cross-validation results instead of pasting a
# number from an earlier run, so the cell stays correct after every re-run.
# NOTE(review): assumes the mean validation AUC is stored under a key of the
# form 'test-AUC...-mean' — confirm against the cv_results table.
cv_results = grid_search_result['cv_results']
auc_mean_keys = [key for key in cv_results
                 if key.startswith('test-AUC') and key.endswith('-mean')]
if not auc_mean_keys:
    raise KeyError(
        f"no 'test-AUC...-mean' entry in cv_results; available keys: {list(cv_results)}"
    )
# Higher AUC is better, so the best score is the maximum over the iterations.
print("Лучший score:", max(cv_results[auc_mean_keys[0]]))