In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import precision_recall_curve, roc_auc_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

### Урок 3. Связь бизнес-показателей и DS-метрик

In [2]:
# Names of the columns treated as categorical features.
# NOTE(review): not referenced by the cells visible here (cat_cols is used instead) — confirm it is still needed.
CAT_FEATURE_NAMES = {'gender', 'cholesterol'}

In [3]:
# The file is semicolon-delimited. The separator must be passed by keyword:
# pandas 2.0+ accepts only the path positionally in read_csv.
df = pd.read_csv('train_case2.csv', sep=';')
df.head(3)

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1


In [4]:
# Hold out a test split; 'cardio' is the target, every other column is a feature.
# `columns=` replaces the positional axis argument (`df.drop('cardio', 1)`),
# which pandas 2.0+ no longer accepts.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='cardio'),
    df['cardio'],
    random_state=0)

In [5]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that picks one entry out of its input by key.

    ``transform`` returns ``X[key]`` — with a pandas DataFrame input that is
    the single column as a 1-D Series. In this notebook it feeds the
    categorical columns into ``OHEEncoder``.

    Parameters
    ----------
    key : column label to select.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        """Nothing to learn; present only to satisfy the estimator API."""
        return self

    def transform(self, X):
        """Return the selection ``X[self.key]``."""
        selected = X[self.key]
        return selected
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that picks one column and keeps the result 2-D.

    ``transform`` returns ``X[[key]]`` — with a pandas DataFrame input that is
    a one-column DataFrame rather than a Series. In this notebook it is used
    for the numeric columns, e.g. ahead of ``StandardScaler``.

    Parameters
    ----------
    key : column label to select.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        """Nothing to learn; present only to satisfy the estimator API."""
        return self

    def transform(self, X):
        """Return the selection ``X[[self.key]]`` (list-of-one-label indexing)."""
        wanted = [self.key]
        return X[wanted]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    """
    One-hot encoder built on ``pd.get_dummies`` with a column layout frozen at fit time.

    ``fit`` records the dummy columns produced for the training data;
    ``transform`` always returns exactly those columns, in that order, so the
    output width does not depend on which categories occur in the data being
    transformed.

    Parameters
    ----------
    key : prefix passed to ``pd.get_dummies`` (the source column name in this notebook).

    Attributes
    ----------
    columns : list of dummy column names learned by ``fit`` (empty before fitting).
    """

    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        """Remember the dummy column names produced for ``X``; returns ``self``."""
        self.columns = list(pd.get_dummies(X, prefix=self.key).columns)
        return self

    def transform(self, X):
        """
        One-hot encode ``X`` and align the result to the columns seen in ``fit``.

        Categories seen during fit but absent from ``X`` become all-zero
        columns; categories not seen during fit are dropped.
        """
        dummies = pd.get_dummies(X, prefix=self.key)
        # The previous implementation zero-filled columns that were *not* in
        # self.columns (and were then discarded) but never added fitted columns
        # missing from X, so X[self.columns] raised KeyError in that case.
        return dummies.reindex(columns=self.columns, fill_value=0)


In [6]:
# Feature columns grouped by the preprocessing they receive.
continuos_cols = ['age', 'height', 'weight', 'ap_hi', 'ap_lo']  # selected, then standard-scaled
cat_cols = ['gender', 'cholesterol']                            # selected, then one-hot encoded
base_cols = ['gluc', 'smoke', 'alco', 'active']                 # selected and passed through as-is

# Each list holds (name, pipeline) pairs, one per column, in the form
# FeatureUnion expects for its transformer list.
continuos_transformers = [
    (column, Pipeline([
        ('selector', NumberSelector(key=column)),
        ('standard', StandardScaler()),
    ]))
    for column in continuos_cols
]

cat_transformers = [
    (column, Pipeline([
        ('selector', ColumnSelector(key=column)),
        ('ohe', OHEEncoder(key=column)),
    ]))
    for column in cat_cols
]

base_transformers = [
    (column, Pipeline([
        ('selector', NumberSelector(key=column)),
    ]))
    for column in base_cols
]

In [7]:
# Concatenate every per-column pipeline side by side: continuous first,
# then categorical, then the pass-through columns.
feats = FeatureUnion([
    *continuos_transformers,
    *cat_transformers,
    *base_transformers,
])
feature_processing = Pipeline(steps=[('feats', feats)])
# Preview of the transformed training matrix (also fits `feats` on X_train).
feature_processing.fit_transform(X_train)

array([[-1.73391771,  0.6873301 ,  0.74843904, ...,  1.        ,
         0.        ,  1.        ],
       [-1.67343538,  0.07758923, -0.29640123, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.13738132,  1.17512278, -0.15708919, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 1.17775864,  1.17512278, -0.15708919, ...,  0.        ,
         0.        ,  1.        ],
       [-0.47190715, -1.38578883,  0.74843904, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.38174619,  0.56538192, -0.08743318, ...,  0.        ,
         0.        ,  1.        ]])

In [8]:
# Build the candidate classification models; each one is a pipeline that
# runs the shared feature preprocessing (`feats`) before the estimator.

classifiers = {
    model_name: Pipeline([
        ('features', feats),
        ('classifier', estimator)])
    for model_name, estimator in {
        'logistic regression': LogisticRegression(random_state=42),
        'random forest': RandomForestClassifier(
            max_depth=10, n_estimators=100, random_state=42, criterion='entropy'),
        'cat boost': CatBoostClassifier(random_state=42, silent=True, eval_metric='AUC'),
        'xgb boost': XGBClassifier(random_state=42),
    }.items()
}

In [9]:
# Lay out the (initially empty) statistics table: one row per classifier,
# two-level columns grouping cross-validation metrics and test-set metrics.

valid_cols = ['mean roc auc', 'roc auc std']
test_cols = ['threshold', 'precision', 'recall', 'fscore', 'roc auc']

cols_names = [
    (section, metric)
    for section, metrics in (('validation', valid_cols), ('test', test_cols))
    for metric in metrics
]

multi_cols = pd.MultiIndex.from_tuples(cols_names)
stat_df = pd.DataFrame(index=classifiers.keys(), columns=multi_cols)

In [10]:
# Weight of recall relative to precision in the F-beta score (1 -> F1).
b = 1

for (name, classifier) in classifiers.items():
    # Cross-validate on the training split.
    cv_scores = cross_val_score(classifier, X_train, y_train, cv=16, scoring='roc_auc')
    cv_score = np.mean(cv_scores)
    cv_score_std = np.std(cv_scores)

    # Refit the pipeline on the whole training split and score the test split.
    classifier.fit(X_train, y_train)
    y_score = classifier.predict_proba(X_test)[:, 1]

    precision, recall, thresholds = precision_recall_curve(y_test.values, y_score)
    numerator = (1 + b ** 2) * (precision * recall)
    denominator = b ** 2 * precision + recall
    # Where precision and recall are both 0 the score is 0/0; define it as 0.
    # A plain division yields NaN there, and np.argmax would then select the
    # NaN entry instead of the best real score.
    fscore = np.divide(numerator, denominator,
                       out=np.zeros_like(numerator), where=denominator > 0)
    # precision/recall have one more element than thresholds (the final
    # precision=1, recall=0 point has no threshold), so search only the
    # entries that can be indexed into `thresholds`.
    ix = np.argmax(fscore[:-1])
    roc_auc = roc_auc_score(y_test.values, y_score)

    stat_df.at[name, ('validation', 'mean roc auc')] = cv_score
    stat_df.at[name, ('validation', 'roc auc std')] = cv_score_std
    stat_df.at[name, ('test', 'threshold')] = thresholds[ix]
    stat_df.at[name, ('test', 'precision')] = precision[ix]
    stat_df.at[name, ('test', 'recall')] = recall[ix]
    stat_df.at[name, ('test', 'fscore')] = fscore[ix]
    stat_df.at[name, ('test', 'roc auc')] = roc_auc

stat_df

Unnamed: 0_level_0,validation,validation,test,test,test,test,test
Unnamed: 0_level_1,mean roc auc,roc auc std,threshold,precision,recall,fscore,roc auc
logistic regression,0.78674,0.00852136,0.386937,0.647431,0.837558,0.730323,0.784035
random forest,0.801695,0.00687063,0.364553,0.666975,0.830876,0.739958,0.801382
cat boost,0.802071,0.00698117,0.386047,0.691275,0.795046,0.739538,0.80108
xgb boost,0.797488,0.00644788,0.347103,0.665248,0.828341,0.73789,0.797228
