In [59]:
from _py.config import config

from copy import deepcopy
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb
import lightgbm as lgbm
import catboost as cb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, RocCurveDisplay
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.preprocessing import StandardScaler
from category_encoders import OrdinalEncoder, TargetEncoder, CatBoostEncoder, CountEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [2]:
class Spec:
    """Plain container describing one base model of the ensemble.

    Every constructor argument is stored unchanged on the instance under
    the same name; no validation or copying is performed here.

    Parameters
    ----------
    name : label of the model.
    model : optional model object or class (default ``None``).
    params : optional hyper-parameter mapping (default ``None``).
    all_features, cat_features, num_features : optional feature-name
        collections (default ``None``).
    preprocessing : optional transformer applied before the model
        (default ``None``).
    """

    def __init__(self, name, model=None, params=None,
                 all_features=None, cat_features=None, num_features=None,
                 preprocessing=None):
        # What to train and with which settings.
        self.name, self.model, self.params = name, model, params

        # Feature-name groups.
        self.all_features, self.cat_features, self.num_features = (
            all_features, cat_features, num_features
        )

        # Optional transformer run in front of the model.
        self.preprocessing = preprocessing

In [3]:
class DFStandardScaler(BaseEstimator, TransformerMixin):
    """Standardise selected columns of a DataFrame and keep the DataFrame container.

    Wraps a ``StandardScaler`` that is fitted on, and applied to, only the
    columns listed in ``num_features``; all other columns pass through
    untouched.
    """

    def __init__(self, num_features):
        self.num_features = num_features
        self.scaler = StandardScaler()

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[num_features]``; ``y`` is ignored."""
        numeric_part = X[self.num_features]
        self.scaler.fit(numeric_part)

        self.is_fitted_ = True
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose ``num_features`` columns are scaled."""
        scaled_values = self.scaler.transform(X[self.num_features])
        result = X.copy()
        result[self.num_features] = scaled_values
        return result

In [4]:
class CategoricalTransformer(BaseEstimator, TransformerMixin):
    """Cast categorical columns to pandas ``category`` dtype with a fixed category set.

    The category levels of every column are learned in ``fit`` and re-used in
    ``transform``. This way the integer codes behind the ``category`` dtype
    mean the same thing in every frame that goes through the fitted
    transformer (train fold, validation fold, test set), even when one of
    them is missing some level.

    Parameters
    ----------
    cat_features : list-like of column names, optional
        Columns to convert. When ``None`` or empty, all ``object`` and
        ``bool`` columns of the frame passed to ``fit`` are used.

    Attributes
    ----------
    cat_features_ : list
        Names of the columns converted by ``transform``.
    categories_ : dict
        Column name -> ``pandas.Index`` of the category levels seen in ``fit``.
    """

    def __init__(self, cat_features=None):
        self.cat_features = cat_features

    def fit(self, X, y=None):
        """Resolve the column list and remember the category levels of each column."""
        # `is None` / `len` instead of `or`: truthiness of an array-like raises.
        if self.cat_features is None or len(self.cat_features) == 0:
            # Keep only the column names, not the selected sub-frame itself.
            columns = X.select_dtypes(include=['object', 'bool']).columns.to_list()
        else:
            columns = list(self.cat_features)

        self.cat_features_ = columns
        self.categories_ = {
            col: X[col].astype('category').cat.categories for col in columns
        }

        self.is_fitted_ = True
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the fitted columns cast to ``category``.

        Values that were not seen during ``fit`` become missing (NaN), because
        the dtype is restricted to the fitted category levels.
        """
        X_out = X.copy()
        for col in self.cat_features_:
            fitted_dtype = pd.CategoricalDtype(categories=self.categories_[col])
            X_out[col] = X_out[col].astype(fitted_dtype)
        return X_out

1) Изменить архитектуру на **evaluate**
2) Сделать дефолтную мета-модель **mean**
3) Сделать возможность добавлять модели к ансамблю
4) Добавление мета-модели постфактум (после обучения базовых моделей)
5) Сделать возможность просмотра **auc** для всего ансамбля
6) Параметр **full_train?** *True/False* (Переобучать модели на всех данных или прогон через модели каждого фолда)
7) Изменить **Verbose**: промежуточный вывод на каждом фолде / только общий итог / без вывода
8) **forward** отбор признаков(предиктов базовых моделей) ансамблем *либо* permutation/shap

In [72]:
class Trainer():
    def __init__(self, models=None, meta_model=None, random_state=None, verbose=False):
        self.models = deepcopy(models)
        self.OOF_PRED = pd.DataFrame()
        self.OOF_PROBA = pd.DataFrame()
        self.meta_model = meta_model or LogisticRegression(penalty='l2', solver='lbfgs', C=0.1, max_iter=10_000)
        self.random_state = random_state
        self.verbose = verbose
                            
    def train(self, X, y):
        self.y_ = y.copy()
        folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=self.random_state)
        
        for spec in self.models:
            oof_pred_proba = np.zeros(X.shape[0], dtype=float)
            oof_pred = np.zeros(X.shape[0], dtype=int)

            num_boost_round = spec.params.get('n_estimators', 500)
            params = spec.params.copy()
            params.pop('n_estimators', None)

            for n_fold, (train_id, valid_id) in enumerate(folds.split(X, y)):
                X_train = X.iloc[train_id].copy()
                y_train = y.iloc[train_id]
                X_valid = X.iloc[valid_id].copy()
                y_valid = y.iloc[valid_id]
                
                if spec.preprocessing:
                    prep = clone(spec.preprocessing)
                    prep.fit(X_train, y_train)
                    X_train = prep.transform(X_train)
                    X_valid = prep.transform(X_valid)
                    
                if re.search('CAT', spec.name.upper()):
                    pool = cb.Pool(X_train, y_train, cat_features=spec.cat_features)
                    model = cb.CatBoostClassifier(**params)
                    model.fit(pool)
                    y_pred_proba = model.predict_proba(X_valid)[:, 1]
                    y_pred = model.predict(X_valid)

                elif re.search('XGB', spec.name.upper()):
                    dtrain = xgb.DMatrix(X_train, y_train, enable_categorical=True)
                    dvalid = xgb.DMatrix(X_valid, y_valid, enable_categorical=True)
                    model = xgb.train(params, dtrain, num_boost_round=num_boost_round)
                    y_pred_proba = model.predict(dvalid)
                    y_pred = (y_pred_proba > 0.5).astype(int)

                elif re.search('LGB', spec.name.upper()):
                    train_dataset = lgbm.Dataset(X_train, y_train, categorical_feature=spec.cat_features)
                    model = lgbm.train(params, train_dataset, num_boost_round=num_boost_round)
                    y_pred_proba = model.predict(X_valid)
                    y_pred = (y_pred_proba > 0.5).astype(int)

                else:
                    model = spec.model(**params)
                    model.fit(X_train, y_train)
                    y_pred_proba = model.predict_proba(X_valid)[:, 1]
                    y_pred = model.predict(X_valid)
                    
                oof_pred_proba[valid_id] = y_pred_proba
                oof_pred[valid_id] = y_pred
                
                # Промежуточный Verbose
                if self.verbose:
                    acc_fold = accuracy_score(y_valid, y_pred)
                    auc_fold = roc_auc_score(y_valid, y_pred_proba)
                    print(f'{spec.name} Model fold_{n_fold} | auc: {auc_fold:.5f} | accuracy: {acc_fold:.5f} ')
            
            self.OOF_PRED[spec.name] = oof_pred
            self.OOF_PROBA[spec.name] = oof_pred_proba
            
            if self.verbose:
                acc = accuracy_score(y, oof_pred)
                auc = roc_auc_score(y, oof_pred_proba)
                print(f'{spec.name} Model OOF | auc: {auc:.5f} | accuracy: {acc:.5f} ')
                print('----------------------------------------------------')
                
            # финальное обучение на всех данных
            if spec.preprocessing:
                spec.preprocessing.fit(X, y)
                X_prep = spec.preprocessing.transform(X)
            else:
                X_prep = X.copy()
                
            if re.search('XGB', spec.name.upper()):
                dfull = xgb.DMatrix(X_prep, y, enable_categorical=True)
                spec.model = xgb.train(params, dfull, num_boost_round=num_boost_round)

            elif re.search('LGB', spec.name.upper()):
                dfull = lgbm.Dataset(X_prep, y, categorical_feature=spec.cat_features)
                spec.model = lgbm.train(params, dfull, num_boost_round=num_boost_round)

            elif re.search('CAT', spec.name.upper()):
                poolfull = cb.Pool(X_prep, y, cat_features=spec.cat_features)
                spec.model = cb.CatBoostClassifier(**params)
                spec.model.fit(poolfull)

            else:
                spec.model = spec.model(**params)
                spec.model.fit(X_prep, y)
                
        self.meta_model.fit(self.OOF_PRED, y)
                
    def predict(self, X):
        y_pred_proba = self.predict_proba(X)
        return (y_pred_proba > 0.5).astype(int)

    def predict_proba(self, X):
        self.OOF_PROBA_ = pd.DataFrame()
        for spec in self.models:
            if spec.preprocessing:
                X_prep = spec.preprocessing.transform(X)
            else:
                X_prep = X.copy()
                
            if re.search('CAT', spec.name.upper()):
                self.OOF_PROBA_[spec.name] = spec.model.predict_proba(X_prep)[:, 1]
            elif re.search('XGB', spec.name.upper()):
                dmatrix = xgb.DMatrix(X_prep, enable_categorical=True)
                self.OOF_PROBA_[spec.name] = spec.model.predict(dmatrix)
            elif re.search('LGB', spec.name.upper()):
                self.OOF_PROBA_[spec.name] = spec.model.predict(X_prep)
            else:
                self.OOF_PROBA_[spec.name] = spec.model.predict_proba(X_prep)[:, 1]
        return self.meta_model.predict_proba(self.OOF_PROBA_)[:, 1]

    def plot_curve(self):
        fig, ax = plt.subplots(1, 1, figsize=(7, 6))
        for col in self.OOF_PROBA.columns.to_list():
            RocCurveDisplay.from_predictions(self.y_, self.OOF_PROBA[col], ax=ax, name=col)
        ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='black')
        plt.show()

-----

In [6]:
# Read both tables from the configured database, using the `id` column as the index.
train = pd.read_sql('SELECT*FROM train', config.engine, index_col='id')
test = pd.read_sql('SELECT*FROM test', config.engine, index_col='id')

# Separate the label column from the training features.
target = train.loc[:, config.target_name]
train = train.drop(columns=config.target_name)

In [7]:
# Column-name groups of the training frame, split by dtype.
base_all_features = list(train.columns)
base_cat_features = list(train.select_dtypes(include=['object', 'bool', 'category']).columns)
base_num_features = list(train.select_dtypes(include='number').columns)

---------

In [39]:
# Hyper-parameters of the base models, one mapping per library.

# XGBoost
params_xgb = dict(
    device='gpu',
    objective='binary:logistic',
    learning_rate=0.1,
    tree_method='hist',
)

# CatBoost
params_cat = dict(
    task_type='GPU',
    iterations=500,
    loss_function='Logloss',
    verbose=False,
)

# LightGBM
params_lgbm = dict(
    device='gpu',
    objective='binary',
    boosting_type='gbdt',
    learning_rate=0.1,
    n_estimators=500,
    verbosity=-1,
)

# Logistic regression
params_logreg = dict(
    penalty='l1',
    solver='liblinear',
    C=0.05,
    max_iter=1_000_000,
)

In [40]:
# Income brackets from lowest to highest and the integer code given to each.
order = ['Low', 'Lower-Middle', 'Middle', 'Upper-Middle', 'High']
values = list(range(len(order)))

# Every categorical column except `income_level`, which has its own explicit
# mapping below. A comprehension is required here: comparing the *list*
# `base_cat_features` with a string yields a single bool, so the former
# NumPy mask filtered nothing and `income_level` stayed in `other_cols`,
# where the second encoder would encode it again.
other_cols = [col for col in base_cat_features if col != 'income_level']

income_level_enc = OrdinalEncoder(mapping=[{'col': 'income_level',
                                           'mapping': dict(zip(order, values))}],
                                 cols=['income_level'])

# Step 1: explicit ordinal mapping for income_level; step 2: the remaining
# categorical columns.
encoder_pipe = Pipeline([
    ('income_level_enc', income_level_enc),
    ('other_cols_enc', OrdinalEncoder(cols=other_cols))
])

# Preprocessing for the linear model: scale numeric columns, then encode
# the categorical ones.
preprocessor_logreg = Pipeline([
    ('scaler', DFStandardScaler(num_features=base_num_features)),
    ('encoder', encoder_pipe)
])


In [61]:
# Feature groups shared by every base-model spec.
spec_params = dict(
    cat_features=base_cat_features,
    all_features=base_all_features,
    num_features=base_num_features,
)

# Base models of the ensemble.
models = [
    Spec('XGB', params=params_xgb, preprocessing=CategoricalTransformer(), **spec_params),
    Spec('LGBM', params=params_lgbm, preprocessing=CategoricalTransformer(), **spec_params),
    Spec('CAT', params=params_cat, **spec_params),
    # Spec('LogReg', model=LogisticRegression, params=params_logreg, preprocessing=preprocessor_logreg, **spec_params)
]

In [62]:
# Seed the fold shuffling so the OOF metrics and the stacked predictions
# can be reproduced after Restart & Run All.
T = Trainer(models=models, random_state=42, verbose=True)

In [63]:
# Cross-validate every base model, refit on all rows and fit the meta-model.
T.train(X=train, y=target)

XGB Model fold_0 | auc: 0.72297 | accuracy: 0.68217 
XGB Model fold_1 | auc: 0.72420 | accuracy: 0.68444 
XGB Model fold_2 | auc: 0.72974 | accuracy: 0.68684 
XGB Model fold_3 | auc: 0.72616 | accuracy: 0.68534 
XGB Model fold_4 | auc: 0.72263 | accuracy: 0.68177 
XGB Model fold_5 | auc: 0.72551 | accuracy: 0.68293 
XGB Model fold_6 | auc: 0.72822 | accuracy: 0.68481 
XGB Model fold_7 | auc: 0.72541 | accuracy: 0.68299 
XGB Model fold_8 | auc: 0.72262 | accuracy: 0.68199 
XGB Model fold_9 | auc: 0.72681 | accuracy: 0.68317 
XGB Model OOF | auc: 0.72542 | accuracy: 0.68365 
----------------------------------------------------
LGBM Model fold_0 | auc: 0.72595 | accuracy: 0.68420 
LGBM Model fold_1 | auc: 0.72648 | accuracy: 0.68623 
LGBM Model fold_2 | auc: 0.72834 | accuracy: 0.68483 
LGBM Model fold_3 | auc: 0.72808 | accuracy: 0.68577 
LGBM Model fold_4 | auc: 0.72825 | accuracy: 0.68441 
LGBM Model fold_5 | auc: 0.72755 | accuracy: 0.68571 
LGBM Model fold_6 | auc: 0.72934 | accuracy

--------

In [64]:
# Out-of-fold probabilities on the training rows, one column per base model.
T.OOF_PROBA

Unnamed: 0,XGB,LGBM,CAT
0,0.489324,0.438067,0.488849
1,0.597435,0.592536,0.602475
2,0.201323,0.203729,0.268082
3,0.507546,0.504898,0.555197
4,0.803559,0.794958,0.731024
...,...,...,...
699995,0.348816,0.354387,0.397244
699996,0.596341,0.626147,0.598142
699997,0.620856,0.657386,0.649272
699998,0.615170,0.623607,0.615962


------

In [47]:
def create_submission(y_preds, file_name):
    """Write predictions to ``<config.path_submissions>/<file_name>.csv``.

    The ``sample_submission`` table is read from the configured database,
    its ``config.target_name`` column is replaced by ``y_preds``, and the
    frame is saved without the index.

    NOTE(review): ``y_preds`` is presumably in the same row order as
    ``sample_submission`` - nothing here checks that; confirm at the caller.
    """
    frame = pd.read_sql('SELECT * FROM sample_submission', config.engine)
    frame[config.target_name] = y_preds

    destination = '/'.join([config.path_submissions, file_name + '.csv'])
    frame.to_csv(destination, index=False)

In [65]:
# Submission 1: probabilities produced by the meta-model.
y_preds = T.predict_proba(X=test)
create_submission(y_preds=y_preds, file_name='ensemble_0')

In [70]:
# Submission 2: plain average of the base-model probabilities.
# T.OOF_PROBA_ is filled as a side effect of the last T.predict_proba() call,
# so this cell has to run after the cell that predicts on `test`.
y_preds_2 = T.OOF_PROBA_.mean(axis=1).to_numpy()
create_submission(y_preds=y_preds_2, file_name='ensemble_1')