## Создание и обучение пайплайна

Загрузим необходимые библиотеки

In [None]:
import pandas as pd
import dill
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression

#pipeline
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import precision_score,recall_score

Загрузим данные

In [None]:
# Load the semicolon-delimited dataset. The separator is passed by keyword:
# handing it to read_csv positionally is deprecated and is rejected by
# pandas >= 2.0, where every argument after the path is keyword-only.
df = pd.read_csv("cardio.csv", sep=";")
df.head(5)

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [None]:
# Show the (rows, columns) dimensions of the loaded frame.
df.shape

(70000, 13)

In [None]:
# Count how many rows carry each value of the `cardio` column.
df['cardio'].value_counts()

0    35021
1    34979
Name: cardio, dtype: int64

Разделим данные на train/test и сохраним тестовую выборку на диск

In [None]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
# NOTE(review): the whole frame is passed as X, so the feature splits (and the
# CSV files written below) still contain the `cardio` column - confirm that
# downstream consumers expect this.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    df['cardio'],
    test_size=0.33,
    random_state=42,
)

# Persist the training split (row index is not written).
X_train.to_csv("X_train_cardio.csv", index=None)
y_train.to_csv("y_train_cardio.csv", index=None)

# Persist the test split (row index is not written).
X_test.to_csv("X_test_cardio.csv", index=None)
y_test.to_csv("y_test_cardio.csv", index=None)

Напишем пайплайн:

In [None]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that indexes its input with ``column``.

    Nothing is learned during ``fit``; ``transform`` simply returns
    ``X[column]``.
    """

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        """Do nothing and return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` indexed by the configured ``column``; ``y`` is ignored."""
        selected = X[self.column]
        return selected
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps only the column named ``key``.

    The input is indexed with a one-element list (``X[[key]]``), so for a
    pandas DataFrame the result is a one-column frame rather than a Series.
    Intended for numeric columns that need further processing downstream.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        """Do nothing and return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` restricted to the single column ``key``."""
        wanted = [self.key]
        return X[wanted]

In [None]:
# List the column labels present in the training split.
X_train.columns

Index(['id', 'age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
       'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio'],
      dtype='object')

In [None]:
# Columns fed to the model, in the order they are passed on to the feature
# union. Every one of them is forwarded unchanged by a NumberSelector.
continuous_columns = [
    'age',
    'gender',
    'height',
    'weight',
    'ap_hi',
    'ap_lo',
    'cholesterol',
    'gluc',
    'smoke',
    'alco',
    'active',
]

Соберем модуль, ответственный за feature engineering

In [None]:
# Build one (name, pipeline) pair per listed column. Each pipeline has a single
# 'selector' step that extracts that column, and the pair is named after the
# column. A comprehension keeps the loop temporaries out of the notebook
# namespace.
final_transformers = [
    (cont_col, Pipeline([('selector', NumberSelector(key=cont_col))]))
    for cont_col in continuous_columns
]

In [None]:
# Display the collected (column name, pipeline) pairs.
final_transformers

[('age', Pipeline(steps=[('selector', NumberSelector(key='age'))])),
 ('gender', Pipeline(steps=[('selector', NumberSelector(key='gender'))])),
 ('height', Pipeline(steps=[('selector', NumberSelector(key='height'))])),
 ('weight', Pipeline(steps=[('selector', NumberSelector(key='weight'))])),
 ('ap_hi', Pipeline(steps=[('selector', NumberSelector(key='ap_hi'))])),
 ('ap_lo', Pipeline(steps=[('selector', NumberSelector(key='ap_lo'))])),
 ('cholesterol',
  Pipeline(steps=[('selector', NumberSelector(key='cholesterol'))])),
 ('gluc', Pipeline(steps=[('selector', NumberSelector(key='gluc'))])),
 ('smoke', Pipeline(steps=[('selector', NumberSelector(key='smoke'))])),
 ('alco', Pipeline(steps=[('selector', NumberSelector(key='alco'))])),
 ('active', Pipeline(steps=[('selector', NumberSelector(key='active'))]))]

In [None]:
# Combine the per-column selector pipelines into a single FeatureUnion.
feats = FeatureUnion(final_transformers)

Добавим классификатор

In [None]:
# Feature stage: the union of all per-column selector pipelines.
feats = FeatureUnion(transformer_list=final_transformers)

# Full model: feature extraction followed by a logistic regression with
# default hyper-parameters.
pipeline = Pipeline(
    steps=[
        ('features', feats),
        ('classifier', LogisticRegression()),
    ]
)

# Train on the training split; fit() returns the pipeline, so the notebook
# displays its repr.
pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('age',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='age'))])),
                                                ('gender',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='gender'))])),
                                                ('height',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='height'))])),
                                                ('weight',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='weight'))])),
                             

In [None]:
%%time

# Fit the pipeline again, this time under the %%time cell magic, to measure
# how long training takes.
pipeline.fit(X_train, y_train)

CPU times: user 800 ms, sys: 598 ms, total: 1.4 s
Wall time: 753 ms


Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('age',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='age'))])),
                                                ('gender',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='gender'))])),
                                                ('height',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='height'))])),
                                                ('weight',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='weight'))])),
                             

Посмотрим, как выглядит пайплайн

In [None]:
# Inspect the pipeline's ordered list of (name, estimator) steps.
pipeline.steps

[('features', FeatureUnion(transformer_list=[('age',
                                  Pipeline(steps=[('selector',
                                                   NumberSelector(key='age'))])),
                                 ('gender',
                                  Pipeline(steps=[('selector',
                                                   NumberSelector(key='gender'))])),
                                 ('height',
                                  Pipeline(steps=[('selector',
                                                   NumberSelector(key='height'))])),
                                 ('weight',
                                  Pipeline(steps=[('selector',
                                                   NumberSelector(key='weight'))])),
                                 ('ap_hi',
                                  Pipeline(steps=[('selector',
                                                   Numb...
                                 ('cholesterol',
            

Сохраним модель на диск

In [None]:
# Serialize the fitted pipeline to disk with dill; the context manager closes
# the file even if serialization fails.
with open("cardio_pipeline_lr.dill", "wb") as model_file:
    dill.dump(pipeline, model_file)