## Обучение модели

Возьмём датасет для задачи бинарной классификации: электронные медицинские карты пациентов с результатами лабораторных анализов. Цель — определить дальнейший ход лечения, то есть требуется ли пациенту дальнейший уход или нет.

In [1]:
import numpy as np
import pandas as pd
import dill

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, precision_recall_curve

In [2]:
# Load the raw dataset from a CSV file located in the current working directory.
df = pd.read_csv('data-ori.csv')

In [3]:
# Preview the first rows to check column names and value formats.
df.head()

Unnamed: 0,HAEMATOCRIT,HAEMOGLOBINS,ERYTHROCYTE,LEUCOCYTE,THROMBOCYTE,MCH,MCHC,MCV,AGE,SEX,SOURCE
0,35.1,11.8,4.65,6.3,310,25.4,33.6,75.5,1,F,out
1,43.5,14.8,5.39,12.7,334,27.5,34.0,80.7,1,F,out
2,33.5,11.3,4.74,13.2,305,23.8,33.7,70.7,1,F,out
3,39.1,13.7,4.98,10.5,366,27.5,35.0,78.5,1,F,out
4,30.9,9.9,4.23,22.1,333,23.4,32.0,73.0,1,M,out


In [4]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

Unnamed: 0,HAEMATOCRIT,HAEMOGLOBINS,ERYTHROCYTE,LEUCOCYTE,THROMBOCYTE,MCH,MCHC,MCV,AGE
count,4412.0,4412.0,4412.0,4412.0,4412.0,4412.0,4412.0,4412.0,4412.0
mean,38.197688,12.741727,4.54126,8.718608,257.524479,28.234701,33.343042,84.612942,46.626473
std,5.974784,2.079903,0.784091,5.049041,113.972365,2.672639,1.228664,6.859101,21.731218
min,13.7,3.8,1.48,1.1,8.0,14.9,26.0,54.0,1.0
25%,34.375,11.4,4.04,5.675,188.0,27.2,32.7,81.5,29.0
50%,38.6,12.9,4.57,7.6,256.0,28.7,33.4,85.4,47.0
75%,42.5,14.2,5.05,10.3,321.0,29.8,34.1,88.7,64.0
max,69.0,18.9,7.86,76.6,1183.0,40.8,39.0,115.6,99.0


In [5]:
# Column dtypes and non-null counts: used to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4412 entries, 0 to 4411
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HAEMATOCRIT   4412 non-null   float64
 1   HAEMOGLOBINS  4412 non-null   float64
 2   ERYTHROCYTE   4412 non-null   float64
 3   LEUCOCYTE     4412 non-null   float64
 4   THROMBOCYTE   4412 non-null   int64  
 5   MCH           4412 non-null   float64
 6   MCHC          4412 non-null   float64
 7   MCV           4412 non-null   float64
 8   AGE           4412 non-null   int64  
 9   SEX           4412 non-null   object 
 10  SOURCE        4412 non-null   object 
dtypes: float64(7), int64(2), object(2)
memory usage: 379.3+ KB


In [6]:
# Class balance of the SOURCE column (used below as the target).
df['SOURCE'].value_counts()

out    2628
in     1784
Name: SOURCE, dtype: int64

- Соберем пайплайн для признаков.

In [7]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that extracts ``X[column]`` from its input.

    Parameters
    ----------
    column
        Key used to index the input in ``transform``; with a single label
        the result of ``X[column]`` is whatever plain indexing returns.
    """

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the part of ``X`` addressed by ``self.column``."""
        selected = X[self.column]
        return selected
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that selects one column and keeps it 2-D.

    Parameters
    ----------
    key
        Label of the column to select. ``transform`` indexes with a
        one-element list (``X[[key]]``), so the result stays a table with a
        single column rather than a 1-D object.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` restricted to the single column ``self.key``."""
        wanted = [self.key]
        return X[wanted]
    
class Encoder(BaseEstimator, TransformerMixin):
    """Stateless binary encoder for the column ``key``.

    Values equal to ``'M'`` become 1 and every other value becomes 0; the
    encoded column is stored as int8 (``'i1'``).

    Parameters
    ----------
    key
        Label of the column to encode.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with column ``self.key`` replaced by 0/1 flags."""
        # Work on a copy so the caller's frame is left untouched.
        encoded = X.copy()
        is_m = encoded[self.key] == 'M'
        encoded[self.key] = np.where(is_m, 1, 0).astype('i1')
        return encoded

In [8]:
# Every column except the categorical SEX and the SOURCE target gets its own
# single-step pipeline that just selects that column.
numeric_columns = df.columns.drop(['SEX', 'SOURCE'])
transformers = [
    (name, Pipeline([('selector', NumberSelector(key=name))]))
    for name in numeric_columns
]

# SEX is selected first and then turned into a 0/1 flag by Encoder.
sex_pipeline = Pipeline([
    ('selector', NumberSelector(key='SEX')),
    ('encoder', Encoder(key='SEX')),
])
transformers.append(('SEX', sex_pipeline))

# Concatenate the outputs of all per-column pipelines into one feature matrix.
feats = FeatureUnion(transformers)

- Заменим значения целевого признака на 0 и 1. Далее разобьём датасет на train и test и сохраним их на диск.

In [9]:
# Encode the target as int8: 'in' -> 1, any other label -> 0.
# The dtype guard keeps this cell safe to re-run: once SOURCE already holds
# 0/1, comparing it with 'in' is False for every row and an unguarded re-run
# would silently overwrite the whole target with zeros.
if not pd.api.types.is_numeric_dtype(df['SOURCE']):
    df['SOURCE'] = np.where(df['SOURCE'] == 'in', 1, 0).astype('i1')

In [10]:
# Split features and target into train and test parts. No test_size is given,
# so scikit-learn's default proportion applies; random_state pins the shuffle
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['SOURCE']),
    df['SOURCE'],
    random_state=23,
)

# Persist all four parts without the row index. `index=False` is the
# documented boolean form (the previous `index=None` only worked because
# None is falsy).
X_test.to_csv("X_test.csv", index=False)
y_test.to_csv("y_test.csv", index=False)

X_train.to_csv("X_train.csv", index=False)
y_train.to_csv("y_train.csv", index=False)

- Построим, обучим и сохраним модель бустинга.

In [11]:
%%time

# Hyperparameters of the boosting classifier, collected in one place.
xgb_params = {
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_depth': 5,
    'use_label_encoder': False,
    'eval_metric': 'error',
    'random_state': 23,
}

# Full model: feature extraction followed by the XGBoost classifier.
xgb = Pipeline(steps=[
    ('features', feats),
    ('classifier', XGBClassifier(**xgb_params)),
])

# fit() is the last expression of the cell, so its return value (the fitted
# pipeline) is what the notebook displays as the cell output.
xgb.fit(X_train, y_train)

Wall time: 416 ms


Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('HAEMATOCRIT',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='HAEMATOCRIT'))])),
                                                ('HAEMOGLOBINS',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='HAEMOGLOBINS'))])),
                                                ('ERYTHROCYTE',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='ERYTHROCYTE'))])),
                                                ('LEUCOCYTE',
                                                 Pipeline(steps=[('selector',
                                                                  Numbe...
             

In [12]:
# Inspect the (name, estimator) steps that make up the fitted pipeline.
xgb.steps

[('features',
  FeatureUnion(transformer_list=[('HAEMATOCRIT',
                                  Pipeline(steps=[('selector',
                                                   NumberSelector(key='HAEMATOCRIT'))])),
                                 ('HAEMOGLOBINS',
                                  Pipeline(steps=[('selector',
                                                   NumberSelector(key='HAEMOGLOBINS'))])),
                                 ('ERYTHROCYTE',
                                  Pipeline(steps=[('selector',
                                                   NumberSelector(key='ERYTHROCYTE'))])),
                                 ('LEUCOCYTE',
                                  Pipeline(steps=[('selector',
                                                   NumberSelector(key='LEUCOCYTE'))]...
                                  Pipeline(steps=[('selector',
                                                   NumberSelector(key='MCH'))])),
                                 ('

In [13]:
# Serialize the whole fitted pipeline (feature steps + classifier) to disk.
# NOTE(review): presumably dill is used instead of pickle because the custom
# transformer classes are defined inside the notebook — confirm.
# The resulting file must only ever be loaded from a trusted source:
# deserializing it can execute arbitrary code.
with open("xgb_pipeline.dill", "wb") as model_file:
    dill.dump(xgb, model_file)