In [137]:
from _py.config import config

from copy import deepcopy
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb
import lightgbm as lgbm
import catboost as cb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, RocCurveDisplay
from sklearn.base import BaseEstimator, TransformerMixin

In [6]:
class Preprocessor(BaseEstimator, TransformerMixin):
    """Cast categorical-like columns to ``category`` dtype or to integer codes.

    Columns whose dtype is ``object``, ``category`` or ``bool`` are treated as
    categorical. ``fit`` records the categorical dtype (category set and
    ordering) of each such column, and ``transform`` re-applies exactly that
    dtype, so the same value maps to the same category/code on every frame
    that is transformed afterwards (train folds, validation folds, test set).

    Parameters
    ----------
    codes : bool, default=False
        If True, categorical columns are replaced by their integer codes;
        values that were not seen during ``fit`` (and missing values) get -1.
        If False, the columns are returned with ``category`` dtype; unseen
        values become NaN.

    Attributes
    ----------
    categories_ : dict
        Maps column name -> ``pandas.CategoricalDtype`` learned in ``fit``.
    """

    # dtypes that are considered categorical by this transformer
    _CAT_DTYPES = ['object', 'category', 'bool']

    def __init__(self, codes=False):
        self.codes = codes

    def fit(self, X, y=None):
        """Learn the category set of every categorical-like column of ``X``.

        ``y`` is accepted for scikit-learn API compatibility and ignored.
        Returns ``self``.
        """
        cat_cols = X.select_dtypes(include=self._CAT_DTYPES).columns.to_list()
        self.categories_ = {
            col: X[col].astype('category').dtype for col in cat_cols
        }
        return self

    def transform(self, X):
        """Return a copy of ``X`` with categorical-like columns converted.

        Columns seen during ``fit`` use the learned categories. Columns that
        were not seen (or every column, if ``fit`` was never called) fall back
        to categories inferred from ``X`` itself.
        """
        X_out = X.copy()
        learned = getattr(self, 'categories_', {})
        cat_cols = X_out.select_dtypes(include=self._CAT_DTYPES).columns.to_list()
        for col in cat_cols:
            # 'category' lets pandas infer the categories from this frame only.
            target_dtype = learned.get(col, 'category')
            as_category = X_out[col].astype(target_dtype)
            X_out[col] = as_category.cat.codes if self.codes else as_category

        return X_out

In [42]:
# Scratch check of the substring test used to pick a backend from a model
# name: prints True when the pattern 'CAT' occurs in the upper-cased name.
print(bool(re.search('CAT', 'XGB'.upper())))

False


In [117]:
# Scratch check: `or` yields its right operand when the left one is falsy,
# so this expression evaluates to 2.
None or 2

2

In [121]:
class Trainer():
    """Cross-validated trainer that collects out-of-fold (OOF) predictions.

    Every entry of ``models`` is a spec object exposing ``name``, ``model``,
    ``params``, ``cat_features`` and ``preprocessing`` (see ``ModelSpec``).
    The training backend is picked from the upper-cased spec name, checked in
    this order:

    * contains ``'CAT'`` -> ``catboost.CatBoostClassifier``
    * contains ``'XGB'`` -> native ``xgboost.train``
    * contains ``'LGB'`` -> native ``lightgbm.train``
    * anything else      -> ``spec.model`` is used as a scikit-learn style
      estimator (either a class or an instance)

    Parameters
    ----------
    models : list or None
        Model specs. The list is deep-copied, so the caller's objects are
        never modified. ``None`` is treated as an empty list.
    meta_model : estimator or None
        Second-level model; defaults to ``LogisticRegression()``. It is only
        stored here; nothing in this class fits it yet.
    random_state : int or None
        Seed forwarded to ``StratifiedKFold``.
    n_splits : int, default=10
        Number of cross-validation folds.

    Attributes
    ----------
    OOF_PRED : pandas.DataFrame
        One column per model with out-of-fold class predictions, indexed like
        the ``X`` passed to ``train``.
    OOF_PROBA : pandas.DataFrame
        One column per model with out-of-fold positive-class probabilities.
    """

    # Boosting rounds used when the params do not contain 'n_estimators'.
    DEFAULT_NUM_BOOST_ROUND = 500

    def __init__(self, models=None, meta_model=None, random_state=None, n_splits=10):
        self.models = deepcopy(models) if models is not None else []
        self.OOF_PRED = pd.DataFrame()
        self.OOF_PROBA = pd.DataFrame()
        # `meta_model or ...` would evaluate bool(meta_model); estimators that
        # define __len__ (e.g. unfitted ensembles) raise there, so test for None.
        self.meta_model = meta_model if meta_model is not None else LogisticRegression()

        self.random_state = random_state
        self.n_splits = n_splits

    def train(self, X, y):
        """Run stratified K-fold CV for every spec and store OOF predictions.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature matrix (rows are selected with ``.iloc``).
        y : pandas.Series
            Binary target aligned with ``X``.

        After the call ``OOF_PRED`` / ``OOF_PROBA`` hold one column per spec
        name, and each ``spec.model`` is the model fitted on the last fold.
        """
        self.models_ = []
        folds = StratifiedKFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state)
        # Materialise the splits once so every model is evaluated on the same
        # folds even when random_state is None.
        splits = list(folds.split(X, y))

        # Start from clean frames that share X's index, so repeated calls do
        # not mix results and the columns line up with the input rows.
        self.OOF_PRED = pd.DataFrame(index=X.index)
        self.OOF_PROBA = pd.DataFrame(index=X.index)

        fit_predict_by_backend = {
            'CAT': self._fit_predict_catboost,
            'XGB': self._fit_predict_xgboost,
            'LGB': self._fit_predict_lightgbm,
            'SKLEARN': self._fit_predict_sklearn,
        }

        for spec in self.models:
            oof_pred_proba = np.zeros(X.shape[0], dtype=float)
            oof_pred = np.zeros(X.shape[0], dtype=int)

            fit_predict = fit_predict_by_backend[self._backend(spec.name)]
            cat_features = self._cat_features(spec)
            preprocessing = getattr(spec, 'preprocessing', None)
            # Remember what the spec was configured with: spec.model is
            # overwritten with the fitted model after each fold.
            base_model = spec.model

            for train_id, valid_id in splits:
                X_train = X.iloc[train_id].copy()
                y_train = y.iloc[train_id]

                X_valid = X.iloc[valid_id].copy()
                y_valid = y.iloc[valid_id]

                if preprocessing is not None:
                    # Fit a fresh copy on the training fold only, so nothing
                    # learned from the validation rows leaks into training.
                    fold_prep = deepcopy(preprocessing)
                    X_train = fold_prep.fit_transform(X_train, y_train)
                    X_valid = fold_prep.transform(X_valid)

                # Each fold gets its own copy of the params: the booster
                # helpers pop 'n_estimators', which must not affect later folds.
                fold_params = dict(spec.params or {})

                spec.model, y_pred_proba, y_pred = fit_predict(
                    base_model, fold_params, cat_features,
                    X_train, y_train, X_valid, y_valid,
                )

                oof_pred_proba[valid_id] = y_pred_proba
                oof_pred[valid_id] = y_pred

            self.OOF_PRED[spec.name] = oof_pred
            self.OOF_PROBA[spec.name] = oof_pred_proba

    @staticmethod
    def _backend(name):
        """Map a spec name to 'CAT', 'XGB', 'LGB' or 'SKLEARN' (first match wins)."""
        upper_name = str(name).upper()
        for key in ('CAT', 'XGB', 'LGB'):
            if key in upper_name:
                return key
        return 'SKLEARN'

    @staticmethod
    def _cat_features(spec):
        """Return the spec's categorical feature list, or None if it has none."""
        cat_features = getattr(spec, 'cat_features', None)
        if cat_features is None:
            # ModelSpec stores the list under the misspelled name `cat_featurs`.
            cat_features = getattr(spec, 'cat_featurs', None)
        return cat_features

    @staticmethod
    def _fit_predict_catboost(base_model, params, cat_features,
                              X_train, y_train, X_valid, y_valid):
        """Fit a CatBoostClassifier on one fold; return (model, proba, labels)."""
        pool = cb.Pool(X_train, y_train, cat_features=cat_features)
        model = cb.CatBoostClassifier(**params)
        model.fit(pool)

        y_pred_proba = model.predict_proba(X_valid)[:, 1]
        y_pred = np.asarray(model.predict(X_valid)).ravel()
        return model, y_pred_proba, y_pred

    @classmethod
    def _fit_predict_xgboost(cls, base_model, params, cat_features,
                             X_train, y_train, X_valid, y_valid):
        """Train a native XGBoost booster on one fold; return (model, proba, labels)."""
        # Switch on categorical support only when the frame really carries
        # pandas `category` columns.
        enable_categorical = any(
            isinstance(dtype, pd.CategoricalDtype) for dtype in X_train.dtypes
        )
        dtrain = xgb.DMatrix(X_train, y_train, enable_categorical=enable_categorical)
        dvalid = xgb.DMatrix(X_valid, y_valid, enable_categorical=enable_categorical)

        # 'n_estimators' is a scikit-learn wrapper name; the native API takes
        # the number of rounds as a separate argument.
        num_boost_round = params.pop('n_estimators', cls.DEFAULT_NUM_BOOST_ROUND)
        model = xgb.train(params, dtrain, num_boost_round=num_boost_round)

        y_pred_proba = model.predict(dvalid)
        y_pred = (y_pred_proba > 0.5).astype(int)
        return model, y_pred_proba, y_pred

    @classmethod
    def _fit_predict_lightgbm(cls, base_model, params, cat_features,
                              X_train, y_train, X_valid, y_valid):
        """Train a native LightGBM booster on one fold; return (model, proba, labels)."""
        train_dataset = lgbm.Dataset(
            X_train, y_train,
            # 'auto' is LightGBM's own default when no list is supplied.
            categorical_feature=cat_features if cat_features is not None else 'auto',
        )

        num_boost_round = params.pop('n_estimators', cls.DEFAULT_NUM_BOOST_ROUND)
        model = lgbm.train(params, train_dataset, num_boost_round=num_boost_round)

        y_pred_proba = model.predict(X_valid)
        y_pred = (y_pred_proba > 0.5).astype(int)
        return model, y_pred_proba, y_pred

    @staticmethod
    def _fit_predict_sklearn(base_model, params, cat_features,
                             X_train, y_train, X_valid, y_valid):
        """Fit a scikit-learn style estimator on one fold; return (model, proba, labels)."""
        if base_model is None:
            raise ValueError(
                "spec.model must be an estimator class or instance for "
                "models that are not CatBoost/XGBoost/LightGBM"
            )
        # Accept a class (instantiate it) or an instance (copy it, so fitted
        # state from a previous fold is never reused).
        model = base_model() if isinstance(base_model, type) else deepcopy(base_model)
        model.set_params(**params)
        model.fit(X_train, y_train)

        y_pred_proba = model.predict_proba(X_valid)[:, 1]
        y_pred = model.predict(X_valid)
        return model, y_pred_proba, y_pred

    def train_catboost(self):
        """Placeholder; not implemented."""
        pass

    def train_xgboost(self):
        """Placeholder; not implemented."""
        pass

    def train_lightgbm(self):
        """Placeholder; not implemented."""
        pass

    def predict(self, X):
        """Return 0/1 labels by thresholding ``predict_proba(X)`` at 0.5.

        Raises
        ------
        NotImplementedError
            While ``predict_proba`` is still a stub that returns None.
        """
        y_pred_proba = self.predict_proba(X)
        if y_pred_proba is None:
            raise NotImplementedError("predict_proba is not implemented yet")
        return (y_pred_proba > 0.5).astype(int)

    def predict_proba(self, X):
        """Placeholder; not implemented (returns None)."""
        pass

    def plot_curve(self):
        """Placeholder; not implemented."""
        pass

-----

In [9]:
# Read the train and test tables through the project's DB engine, using the
# `id` column as the index of each frame.
train, test = (
    pd.read_sql(query, config.engine, index_col='id')
    for query in ('SELECT*FROM train', 'SELECT*FROM test')
)

# Separate the label column from the training features.
target = train[config.target_name]
train = train.drop(columns=config.target_name)

In [None]:
# Column inventories of the training frame: every column, then the
# categorical-like ones (object / bool / category) and the numeric ones.
base_all_features = train.columns.tolist()
base_cat_features = (
    train
    .select_dtypes(include=['object', 'bool', 'category'])
    .columns
    .tolist()
)
base_num_features = (
    train
    .select_dtypes(include=['number'])
    .columns
    .tolist()
)

In [120]:
class ModelSpec:
    """Plain container describing one model of the ensemble.

    Parameters
    ----------
    name : str
        Model name; also used as the column name for its predictions.
    model : object, optional
        Estimator class or instance.
    params : dict, optional
        Hyper-parameters. Stored as a shallow copy so later changes to the
        spec do not alter the caller's dict; ``None`` becomes ``{}`` so the
        value can always be unpacked with ``**``.
    all_features, cat_features, num_features : list, optional
        Feature-name lists for this model.
    preprocessing : object, optional
        Transformer to apply before fitting.
    """

    def __init__(self, name, model=None, params=None, 
                 all_features=None, cat_features=None, num_features=None,
                 preprocessing=None):
        
        self.name = name
        self.model = model
        self.params = dict(params) if params is not None else {}
        
        self.all_features = all_features
        self.cat_features = cat_features
        self.num_features = num_features
        
        self.preprocessing = preprocessing

    # The attribute used to be stored under the misspelled name `cat_featurs`;
    # keep that spelling working for code that already reads or writes it.
    @property
    def cat_featurs(self):
        return self.cat_features

    @cat_featurs.setter
    def cat_featurs(self, value):
        self.cat_features = value

In [84]:
# Hyper-parameter sets, one dict per model family.

# XGBoost
params_xgb = dict(
    device='gpu',
    objective='binary:logistic',
    learning_rate=0.1,
    tree_method='hist',
    n_estimators=500,
)

# CatBoost
params_cat = dict(
    task_type='GPU',
    loss_function='Logloss',
    iterations=500,
    verbose=False,
)

# LightGBM
params_lgbm = dict(
    device='gpu',
    objective='binary',
    boosting_type='gbdt',
    learning_rate=0.1,
    verbosity=-1,
)

# L1-penalised logistic regression
params_logreg = dict(
    penalty='l1',
    solver='liblinear',
    C=1.0,
    max_iter=1_000_000,
)

In [144]:
# Scratch check: apply the logistic-regression parameter dict to a fresh
# estimator through set_params.
LogisticRegression().set_params(**params_logreg)

In [98]:
# Base learners for the ensemble. Only the logistic-regression spec carries
# an estimator class; the other three specs provide just a name and params.
models = [
    ModelSpec(name='XGB', params=params_xgb),
    ModelSpec(name='CAT', params=params_cat),
    ModelSpec(name='LGBM', params=params_lgbm),
    ModelSpec(name='LOGREG', model=LogisticRegression, params=params_logreg),
]

In [99]:
# Scratch spec used by the following cells to try out ModelSpec attributes.
sp = ModelSpec('CAT', params=params_cat)

In [102]:
# `params_cat` sets loss_function='Logloss', a classification objective, so
# build the classifier (the estimator Trainer.train uses for CatBoost specs)
# rather than a regressor.
sp.model = cb.CatBoostClassifier(**sp.params)

In [104]:
# Show the parameters held by the estimator attached to the scratch spec.
sp.model.get_params()

{'iterations': 500,
 'loss_function': 'Logloss',
 'verbose': False,
 'task_type': 'GPU'}

In [138]:
# `preprocessing` was not passed when `sp` was created, so it is still the
# default None and the cell displays nothing.
sp.preprocessing

In [141]:
# Scratch: replace the model on the spec with a default-configured XGBClassifier.
sp.model = xgb.XGBClassifier()

In [142]:
# Display the estimator currently attached to the scratch spec.
sp.model