In [1]:
# If you run this notebook on Google Colaboratory, uncomment the line below to install automl_alex.
#!pip install -U -q automl-alex

In [1]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml

import automl_alex
from automl_alex import LightGBMClassifier, DataBunch

print(automl_alex.__version__)

1.02.15


In [2]:
RANDOM_SEED = 42

# Load Data

In [3]:
# Fetch version 1 of the "adult" dataset from OpenML (as_frame=True requests pandas objects)
dataset = fetch_openml(name='adult', version=1, as_frame=True)
# Encode the target labels as integer category codes
dataset.target = dataset.target.astype('category').cat.codes
# Preview the first rows of the feature frame
dataset.data.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country
0,2,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,1,0,2,United-States
1,3,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,0,United-States
2,2,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,2,United-States
3,3,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,2,United-States
4,1,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,2,Cuba


In [50]:
# Hold out 20% of the rows as a test set; the split is seeded with RANDOM_SEED for reproducibility
X_train, X_test, y_train, y_test = train_test_split(dataset.data, 
                                                    dataset.target,
                                                    test_size=0.2, 
                                                    random_state=RANDOM_SEED,)
# Show the (rows, columns) shape of each part
X_train.shape, X_test.shape

((39073, 14), (9769, 14))

# Data Cleaning (DataBunch)

In [52]:
import pandas as pd
import numpy as np
from itertools import combinations
from sklearn.preprocessing import StandardScaler

In [53]:
class CleanNans(object):
    """
    Fill NaNs in numeric columns and flag where values were missing.

    ``fit`` remembers which columns contain NaNs and computes a per-column
    fill statistic (median or mean) over the numeric columns. ``transform``
    adds a ``<column>_isNAN`` indicator (uint8) for every remembered column
    and then fills NaNs with the fitted statistics. Non-numeric columns get an
    indicator but are not filled, because no fill statistic exists for them.
    """

    def __init__(self, method='median'):
        """
        Store the fill strategy.

        Args:
            method : {'median', 'mean'} statistic used as the fill value
                (validated in ``fit``).
        """
        self.method = method

    def fit(self, data, cols=None):
        """
        Learn which columns contain NaNs and compute the fill values.

        Args:
            data (pd.DataFrame, shape (n_samples, n_features)): the input data
            cols (list, optional): columns to restrict the fit to;
                all columns are used when None
        Returns:
            self
        Raises:
            ValueError: if ``method`` is neither 'median' nor 'mean'
        """
        if cols is not None:
            data = data[cols]

        self.nan_columns = list(data.columns[data.isnull().any()])
        if not self.nan_columns:
            print('No nans features')

        # Compare strings by value: `is` checks object identity and only works
        # by accident for interned literals.
        # numeric_only=True: a median/mean is undefined for category/object
        # columns, so they are left out of the fill values instead of failing.
        if self.method == 'median':
            self.fill_value = data.median(numeric_only=True)
        elif self.method == 'mean':
            self.fill_value = data.mean(numeric_only=True)
        else:
            raise ValueError('Wrong fill method')

        return self

    def transform(self, data, cols=None):
        """
        Add NaN indicator columns and fill NaNs with the fitted values.

        The input frame is not modified; a new frame is returned. When the
        fitted data contained no NaNs, no indicator columns are added and the
        data is only passed through the fill step.

        Args:
            data (pd.DataFrame, shape (n_samples, n_features)): the input data
            cols (list, optional): columns to restrict the transform to;
                all columns are used when None
        Returns:
            pandas.DataFrame: copy of the (selected) data with one extra
                ``<column>_isNAN`` column per column that had NaNs at fit time
                and with NaNs in numeric columns filled.
        """
        if cols is not None:
            data = data[cols]
        # Work on a copy so the caller's frame is never mutated (and so that
        # assigning into a column selection does not trigger SettingWithCopy).
        data = data.copy()

        # Indicators must be computed before filling, while the NaNs still exist.
        for nan_column in self.nan_columns:
            data[nan_column + '_isNAN'] = pd.isna(data[nan_column]).astype('uint8')

        return data.fillna(self.fill_value)

    def fit_transform(self, data, cols=None):
        """
        Fit on the data and return its transformed copy.

        Args:
            data (pd.DataFrame, shape (n_samples, n_features)): the input data
            cols (list, optional): columns to restrict fit and transform to;
                all columns are used when None
        Returns:
            pandas.DataFrame: see ``transform``.
        """
        # Pass `cols` to both steps so fit and transform see the same columns.
        return self.fit(data, cols).transform(data, cols)

In [54]:
cn = CleanNans()

In [55]:
cn = cn.fit(X_train)

In [57]:
# NOTE(review): this rebinds X_train to the cleaned frame (with extra *_isNAN columns).
# The outputs recorded further down still report the original 14 columns, so they
# appear to come from a run in which this cell had not been executed — confirm the
# intended order before relying on Restart & Run All.
X_train = cn.transform(X_train)

In [59]:
X_test = cn.transform(X_test)

In [61]:
X_test

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country,workclass_isNAN,occupation_isNAN,native-country_isNAN
7762,0,Private,423024.0,HS-grad,9.0,Never-married,Other-service,Not-in-family,White,Male,0,0,0,United-States,0,0,0
23881,0,Private,178953.0,12th,8.0,Never-married,Sales,Own-child,White,Female,0,0,0,United-States,0,0,0
30507,0,Local-gov,348986.0,HS-grad,9.0,Never-married,Handlers-cleaners,Other-relative,Black,Male,0,0,2,United-States,0,0,0
28911,0,Private,218215.0,Some-college,10.0,Never-married,Sales,Own-child,White,Female,0,0,1,United-States,0,0,0
19484,3,Private,244025.0,HS-grad,9.0,Never-married,Machine-op-inspct,Unmarried,Amer-Indian-Eskimo,Male,0,0,3,Puerto-Rico,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43046,4,Self-emp-not-inc,118614.0,Masters,14.0,Separated,Sales,Unmarried,White,Female,0,0,2,United-States,0,0,0
18798,0,Private,205838.0,HS-grad,9.0,Never-married,Other-service,Own-child,White,Male,0,0,2,United-States,0,0,0
29519,2,Private,194304.0,Some-college,10.0,Divorced,Transport-moving,Not-in-family,Black,Male,0,0,3,United-States,0,0,0
550,3,Self-emp-not-inc,245724.0,Some-college,10.0,Divorced,Exec-managerial,Not-in-family,White,Male,0,0,3,United-States,0,0,0


In [62]:
X_train

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country,workclass_isNAN,occupation_isNAN,native-country_isNAN
37193,1,Private,50753.0,HS-grad,9.0,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,2,United-States,0,0,0
31093,2,State-gov,144351.0,Masters,14.0,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,2,United-States,0,0,0
33814,1,Local-gov,252217.0,12th,8.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,2,United-States,0,0,0
14500,4,Private,69525.0,HS-grad,9.0,Divorced,Craft-repair,Unmarried,White,Male,0,0,0,United-States,0,0,0
23399,4,Self-emp-not-inc,28612.0,HS-grad,9.0,Widowed,Sales,Not-in-family,White,Male,0,0,4,United-States,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11284,1,Private,200117.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,Asian-Pac-Islander,Male,0,2,3,,0,0,1
44732,0,Private,90896.0,HS-grad,9.0,Never-married,Machine-op-inspct,Own-child,White,Female,0,0,2,United-States,0,0,0
38158,0,Private,370057.0,HS-grad,9.0,Divorced,Adm-clerical,Not-in-family,White,Female,0,0,2,United-States,0,0,0
860,0,Private,216284.0,11th,7.0,Never-married,Adm-clerical,Own-child,White,Female,0,0,0,United-States,0,0,0


In [5]:
X_train.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country
37193,1,Private,50753.0,HS-grad,9.0,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,2,United-States
31093,2,State-gov,144351.0,Masters,14.0,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,2,United-States
33814,1,Local-gov,252217.0,12th,8.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,2,United-States
14500,4,Private,69525.0,HS-grad,9.0,Divorced,Craft-repair,Unmarried,White,Male,0,0,0,United-States
23399,4,Self-emp-not-inc,28612.0,HS-grad,9.0,Widowed,Sales,Not-in-family,White,Male,0,0,4,United-States


In [6]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39073 entries, 37193 to 15795
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   age             39073 non-null  category
 1   workclass       36851 non-null  category
 2   fnlwgt          39073 non-null  float64 
 3   education       39073 non-null  category
 4   education-num   39073 non-null  float64 
 5   marital-status  39073 non-null  category
 6   occupation      36842 non-null  category
 7   relationship    39073 non-null  category
 8   race            39073 non-null  category
 9   sex             39073 non-null  category
 10  capitalgain     39073 non-null  category
 11  capitalloss     39073 non-null  category
 12  hoursperweek    39073 non-null  category
 13  native-country  38396 non-null  category
dtypes: category(12), float64(2)
memory usage: 1.3 MB


As we can see, the data is quite dirty: there are object/category features and NaNs. Yet the **model trains successfully even on such a dirty dataset**:

[RUS] Как мы видим, данные довольно грязные: есть object/category признаки и NaN. Но модель успешно обучается даже на таком грязном датасете:

In [7]:
model = LightGBMClassifier(X_train, y_train, X_test, random_state=RANDOM_SEED)

In [8]:
# Fit with the default model parameters and predict
predicts = model.predict()
# ROC AUC of the first entry of predicts['predict_test'] against the held-out labels
print('Test AUC: ', round(sklearn.metrics.roc_auc_score(y_test, predicts['predict_test'][0]),4))

100%|██████████| 1/1 [02:13<00:00, 133.73s/it]
 Mean Score roc_auc_score on 20 Folds: 0.9142 std: 0.00485
Test AUC:  0.9119



**How is this possible?**      
[RUS] как это возможно?

<img src="./img/magic.gif" width="400">

## DataBunch
Before entering the model, the data goes through a full preprocessing cycle in DataBunch.  
[RUS] до того как попасть в модель, данные проходят полный цикл предобработки.

In [9]:
# Build a DataBunch that one-hot encodes the categorical features and fills NaNs;
# every optional feature generator is switched off.
data = DataBunch(X_train=X_train, 
                y_train=y_train,
                X_test=X_test, # X_test must be passed as well: the encoders need the whole dataset to work
                clean_and_encod_data=True,
                cat_encoder_names=['OneHotEncoder',], # list of encoders used to generate the categorical features
                clean_nan=True, # fill NaNs
                num_generator_features=False, # generator of numerical interaction features
                group_generator_features=False, # generator of group-encoder features
                frequency_enc_num_features=False,
                normalization=False,
                cat_features=None, # None: DataBunch auto-detects the categorical features
                random_state=RANDOM_SEED)

Source X_train shape:  (39073, 14) | X_test shape:  (9769, 14)
##################################################
Auto detect cat features:  12
> Start preprocessing Data
> Generate cat encodet features
 +  121  Features from  OneHotEncoder
> Clean Nans in num features
##################################################
> Total Features:  122
##################################################
New X_train shape:  (39073, 122) | X_test shape:  (9769, 122)


In [10]:
data.X_train.head(5)

Unnamed: 0,fnlwgt,OneHotEncoder_relationship_1,OneHotEncoder_relationship_2,OneHotEncoder_relationship_3,OneHotEncoder_relationship_4,OneHotEncoder_relationship_5,OneHotEncoder_relationship_6,OneHotEncoder_race_1,OneHotEncoder_race_2,OneHotEncoder_race_3,...,OneHotEncoder_education_14,OneHotEncoder_education_15,OneHotEncoder_education_16,OneHotEncoder_marital-status_1,OneHotEncoder_marital-status_2,OneHotEncoder_marital-status_3,OneHotEncoder_marital-status_4,OneHotEncoder_marital-status_5,OneHotEncoder_marital-status_6,OneHotEncoder_marital-status_7
0,50753.0,1,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
1,144351.0,1,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
2,252217.0,1,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
3,69525.0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
4,28612.0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0


# Feature Engineering

## Categorical Features 
### Encoders

In [11]:
# available Encoders:
automl_alex.encoders.cat_encoders_names

{'HashingEncoder': category_encoders.hashing.HashingEncoder,
 'SumEncoder': category_encoders.sum_coding.SumEncoder,
 'PolynomialEncoder': category_encoders.polynomial.PolynomialEncoder,
 'BackwardDifferenceEncoder': category_encoders.backward_difference.BackwardDifferenceEncoder,
 'OneHotEncoder': category_encoders.one_hot.OneHotEncoder,
 'HelmertEncoder': category_encoders.helmert.HelmertEncoder,
 'OrdinalEncoder': category_encoders.ordinal.OrdinalEncoder,
 'FrequencyEncoder': automl_alex.encoders.FrequencyEncoder,
 'BaseNEncoder': category_encoders.basen.BaseNEncoder}

In [12]:
# Same preprocessing, but with two encoders: each one contributes its own set of columns.
data = DataBunch(X_train=X_train, 
                y_train=y_train,
                X_test=X_test, # X_test must be passed as well: the encoders need the whole dataset to work
                clean_and_encod_data=True,
                cat_encoder_names=['OrdinalEncoder', 'FrequencyEncoder',], # any of the available encoders can be listed
                clean_nan=True, # fill NaNs
                #cat_features=categorical_features, # optional: DataBunch can detect categorical features itself
                num_generator_features=False, # generator of numerical interaction features
                group_generator_features=False, # generator of group-encoder features
                frequency_enc_num_features=False,
                normalization=False,
                random_state=RANDOM_SEED)
data.X_train.head(5)

Source X_train shape:  (39073, 14) | X_test shape:  (9769, 14)
##################################################
Auto detect cat features:  12
> Start preprocessing Data
> Generate cat encodet features
 +  13  Features from  OrdinalEncoder
 +  13  Features from  FrequencyEncoder
> Clean Nans in num features
##################################################
> Total Features:  27
##################################################
New X_train shape:  (39073, 27) | X_test shape:  (9769, 27)


Unnamed: 0,fnlwgt,OrdinalEncoder_relationship,OrdinalEncoder_race,OrdinalEncoder_hoursperweek,OrdinalEncoder_workclass,OrdinalEncoder_occupation,OrdinalEncoder_capitalloss,OrdinalEncoder_native-country,OrdinalEncoder_sex,OrdinalEncoder_age,...,FrequencyEncoder_workclass,FrequencyEncoder_occupation,FrequencyEncoder_capitalloss,FrequencyEncoder_native-country,FrequencyEncoder_sex,FrequencyEncoder_age,FrequencyEncoder_education-num,FrequencyEncoder_capitalgain,FrequencyEncoder_education,FrequencyEncoder_marital-status
0,50753.0,1,1,1,1,1,1,1,1,1,...,0.694198,0.048217,0.953278,0.897424,0.668482,0.260411,0.323164,0.917387,0.323164,0.458192
1,144351.0,1,1,1,2,2,1,1,1,2,...,0.040559,0.126367,0.953278,0.897424,0.668482,0.244707,0.0544,0.917387,0.0544,0.458192
2,252217.0,1,1,1,3,3,1,1,1,1,...,0.064207,0.030507,0.953278,0.897424,0.668482,0.260411,0.013452,0.917387,0.013452,0.458192
3,69525.0,2,1,2,1,4,1,1,1,3,...,0.694198,0.125138,0.953278,0.897424,0.668482,0.127923,0.323164,0.917387,0.323164,0.135805
4,28612.0,3,1,3,4,5,1,1,1,3,...,0.079071,0.11269,0.953278,0.897424,0.668482,0.127923,0.323164,0.917387,0.323164,0.03108


### Encoding cat features by Groupby with numerical features

In [13]:
# Switch to a dataset with more numerical features ("credit-g", version 1, from OpenML)
dataset = fetch_openml(name='credit-g', version=1, as_frame=True)
# Encode the target labels as integer category codes
dataset.target = dataset.target.astype('category').cat.codes

# Hold out 20% of the rows as a test set (seeded); this rebinds X_train/X_test/y_train/y_test
X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.2, random_state=RANDOM_SEED,)

In [14]:
# Only the group-encoder feature generator is enabled here.
data = DataBunch(X_train=X_train, 
                y_train=y_train,
                X_test=X_test, # X_test must be passed as well: the encoders need the whole dataset to work
                clean_and_encod_data=True,
                cat_encoder_names=None, # None: no categorical encoder features are generated
                clean_nan=True, # fill NaNs
                #cat_features=categorical_features, # optional: DataBunch can detect categorical features itself
                num_generator_features=False, # generator of numerical interaction features
                group_generator_features=True, # generator of group-encoder features
                frequency_enc_num_features=False,
                normalization=False,
                random_state=RANDOM_SEED)
data.X_train.head(5)

Source X_train shape:  (800, 20) | X_test shape:  (200, 20)
##################################################
Auto detect cat features:  13
> Start preprocessing Data
> Clean Nans in num features
> Generate Group Encoder Features
 +  64  Group cat Encoder Features
##################################################
> Total Features:  68
##################################################
New X_train shape:  (800, 68) | X_test shape:  (200, 68)


Unnamed: 0,duration,credit_amount,age,num_dependents,GroupEncoder_age_installment_commitment,GroupEncoder_age_residence_since,GroupEncoder_age_other_payment_plans,GroupEncoder_age_existing_credits,GroupEncoder_age_foreign_worker,GroupEncoder_age_housing,...,GroupEncoder_num_dependents_savings_status,GroupEncoder_num_dependents_credit_history,GroupEncoder_num_dependents_job,GroupEncoder_num_dependents_property_magnitude,GroupEncoder_num_dependents_employment,GroupEncoder_num_dependents_checking_status,GroupEncoder_num_dependents_purpose,GroupEncoder_num_dependents_other_parties,GroupEncoder_num_dependents_personal_status,GroupEncoder_num_dependents_own_telephone
0,60.0,6836.0,63.0,0,2,3,2,1,0,1,...,0,3,2,3,4,0,9,0,2,1
1,21.0,2319.0,33.0,0,1,0,2,0,0,0,...,0,4,2,2,1,2,6,0,0,0
2,6.0,1236.0,50.0,0,1,3,2,0,0,0,...,2,2,2,1,2,3,1,0,2,0
3,21.0,5003.0,29.0,0,0,3,0,1,0,1,...,4,0,2,1,2,3,0,0,1,1
4,12.0,886.0,21.0,0,3,1,2,0,0,1,...,4,2,2,2,2,3,3,0,1,0


## Numerical Features

In [15]:
data.num_features_names

['age', 'credit_amount', 'duration', 'num_dependents']

### Generator interaction Num Features
Numerical interaction generator features: A/B, A*B, A-B, A+B

In [16]:
# Only the numerical interaction feature generator is enabled here.
data = DataBunch(X_train=X_train, 
                y_train=y_train,
                X_test=X_test, # X_test must be passed as well: the encoders need the whole dataset to work
                clean_and_encod_data=True,
                cat_encoder_names=None, # None: no categorical encoder features are generated
                clean_nan=True, # fill NaNs
                #cat_features=categorical_features, # optional: DataBunch can detect categorical features itself
                num_generator_features=True, # generator of numerical interaction features
                group_generator_features=False, # generator of group-encoder features
                frequency_enc_num_features=False,
                normalization=False,
                random_state=RANDOM_SEED)
data.X_train.head(5)

Source X_train shape:  (800, 20) | X_test shape:  (200, 20)
##################################################
Auto detect cat features:  13
> Start preprocessing Data
> Clean Nans in num features
> Generate interaction Num Features
 +  24  Interaction Features
##################################################
> Total Features:  28
##################################################
New X_train shape:  (800, 28) | X_test shape:  (200, 28)


Unnamed: 0,duration,credit_amount,age,num_dependents,age_/_credit_amount,age_*_credit_amount,age_-_credit_amount,age_+_credit_amount,age_/_duration,age_*_duration,...,credit_amount_-_duration,credit_amount_+_duration,credit_amount_/_num_dependents,credit_amount_*_num_dependents,credit_amount_-_num_dependents,credit_amount_+_num_dependents,duration_/_num_dependents,duration_*_num_dependents,duration_-_num_dependents,duration_+_num_dependents
0,60.0,6836.0,63.0,0,0.009216,430668.0,-6773.0,6899.0,1.05,3780.0,...,6776.0,6896.0,0.0,0.0,6836.0,6836.0,0.0,0.0,60.0,60.0
1,21.0,2319.0,33.0,0,0.01423,76527.0,-2286.0,2352.0,1.571429,693.0,...,2298.0,2340.0,0.0,0.0,2319.0,2319.0,0.0,0.0,21.0,21.0
2,6.0,1236.0,50.0,0,0.040453,61800.0,-1186.0,1286.0,8.333333,300.0,...,1230.0,1242.0,0.0,0.0,1236.0,1236.0,0.0,0.0,6.0,6.0
3,21.0,5003.0,29.0,0,0.005797,145087.0,-4974.0,5032.0,1.380952,609.0,...,4982.0,5024.0,0.0,0.0,5003.0,5003.0,0.0,0.0,21.0,21.0
4,12.0,886.0,21.0,0,0.023702,18606.0,-865.0,907.0,1.75,252.0,...,874.0,898.0,0.0,0.0,886.0,886.0,0.0,0.0,12.0,12.0


### Frequency Encoder Numerical Features

In [17]:
# Only the frequency encoding of numerical features is enabled here.
data = DataBunch(X_train=X_train, 
                y_train=y_train,
                X_test=X_test, # X_test must be passed as well: the encoders need the whole dataset to work
                clean_and_encod_data=True,
                cat_encoder_names=None, # None: no categorical encoder features are generated
                clean_nan=True, # fill NaNs
                #cat_features=categorical_features, # optional: DataBunch can detect categorical features itself
                num_generator_features=False, # generator of numerical interaction features
                group_generator_features=False, # generator of group-encoder features
                frequency_enc_num_features=True, 
                normalization=False,
                random_state=RANDOM_SEED)
data.X_train.head(5)

Source X_train shape:  (800, 20) | X_test shape:  (200, 20)
##################################################
Auto detect cat features:  13
> Start preprocessing Data
> Generate Frequency Encode num features
 +  4  Frequency Encode Num Features 
> Clean Nans in num features
##################################################
> Total Features:  8
##################################################
New X_train shape:  (800, 8) | X_test shape:  (200, 8)


Unnamed: 0,duration,credit_amount,age,num_dependents,FrequencyEncoder_age,FrequencyEncoder_credit_amount,FrequencyEncoder_duration,FrequencyEncoder_num_dependents
0,60.0,6836.0,63.0,0,0.008,0.001,0.013,0.845
1,21.0,2319.0,33.0,0,0.033,0.001,0.03,0.845
2,6.0,1236.0,50.0,0,0.012,0.002,0.075,0.845
3,21.0,5003.0,29.0,0,0.037,0.001,0.03,0.845
4,12.0,886.0,21.0,0,0.014,0.001,0.179,0.845


## Normalization Data
Normalization is done with `StandardScaler()`.

In [18]:
# Frequency encoding of numerical features plus normalization of the resulting features.
data = DataBunch(X_train=X_train, 
                y_train=y_train,
                X_test=X_test, # X_test must be passed as well: the encoders need the whole dataset to work
                clean_and_encod_data=True,
                cat_encoder_names=None, # None: no categorical encoder features are generated
                clean_nan=True, # fill NaNs
                #cat_features=categorical_features, # optional: DataBunch can detect categorical features itself
                num_generator_features=False, # generator of numerical interaction features
                group_generator_features=False, # generator of group-encoder features
                frequency_enc_num_features=True, 
                normalization=True,
                random_state=RANDOM_SEED)
data.X_train.head(5)

Source X_train shape:  (800, 20) | X_test shape:  (200, 20)
##################################################
Auto detect cat features:  13
> Start preprocessing Data
> Generate Frequency Encode num features
 +  4  Frequency Encode Num Features 
> Clean Nans in num features
> Normalization Features
##################################################
> Total Features:  8
##################################################
New X_train shape:  (800, 8) | X_test shape:  (200, 8)


Unnamed: 0,duration,credit_amount,age,num_dependents,FrequencyEncoder_age,FrequencyEncoder_credit_amount,FrequencyEncoder_duration,FrequencyEncoder_num_dependents
0,3.297082,1.199912,2.406187,-0.409736,-1.603822,-0.403815,-1.40662,0.409736
1,-0.008051,-0.35963,-0.224364,-0.409736,0.166108,-0.403815,-1.14286,0.409736
2,-1.279256,-0.733547,1.266282,-0.409736,-1.320634,2.062233,-0.44467,0.409736
3,-0.008051,0.56705,-0.575104,-0.409736,0.449297,-0.403815,-1.14286,0.409736
4,-0.770774,-0.854388,-1.276585,-0.409736,-1.179039,-0.403815,1.168925,0.409736


# Model DataBunch

In [19]:
# A prepared DataBunch can be passed straight to the model
model = LightGBMClassifier(databunch=data, random_state=RANDOM_SEED)
# The model exposes the DataBunch it holds as `_data`
model._data.X_train.head(5)

Unnamed: 0,duration,credit_amount,age,num_dependents,FrequencyEncoder_age,FrequencyEncoder_credit_amount,FrequencyEncoder_duration,FrequencyEncoder_num_dependents
0,3.297082,1.199912,2.406187,-0.409736,-1.603822,-0.403815,-1.40662,0.409736
1,-0.008051,-0.35963,-0.224364,-0.409736,0.166108,-0.403815,-1.14286,0.409736
2,-1.279256,-0.733547,1.266282,-0.409736,-1.320634,2.062233,-0.44467,0.409736
3,-0.008051,0.56705,-0.575104,-0.409736,0.449297,-0.403815,-1.14286,0.409736
4,-0.770774,-0.854388,-1.276585,-0.409736,-1.179039,-0.403815,1.168925,0.409736


In [20]:
# Alternatively, pass all DataBunch settings directly to the model
model = LightGBMClassifier(
    X_train, 
    y_train, 
    X_test,
    cat_features=None,
    clean_and_encod_data=True,
    cat_encoder_names=['OneHotEncoder', 'HelmertEncoder', 'HashingEncoder', 'FrequencyEncoder'],
    num_generator_features=True, # generator of numerical interaction features
    group_generator_features=False, # generator of group-encoder features
    frequency_enc_num_features=True, 
    normalization=True,
    clean_nan=True, # fill NaNs
    verbose=1,
    random_state=RANDOM_SEED,
    )
model._data.X_train.head(5)

Source X_train shape:  (800, 20) | X_test shape:  (200, 20)
##################################################
Auto detect cat features:  13
> Start preprocessing Data
> Generate cat encodet features
 +  55  Features from  OneHotEncoder
 +  44  Features from  HelmertEncoder
 +  54  Features from  HashingEncoder
 +  16  Features from  FrequencyEncoder
> Generate Frequency Encode num features
 +  4  Frequency Encode Num Features 
> Clean Nans in num features
> Generate interaction Num Features
 +  24  Interaction Features
> Normalization Features
##################################################
> Total Features:  201
##################################################
New X_train shape:  (800, 201) | X_test shape:  (200, 201)


Unnamed: 0,duration,credit_amount,age,num_dependents,OneHotEncoder_installment_commitment,OneHotEncoder_residence_since,OneHotEncoder_other_payment_plans_1,OneHotEncoder_other_payment_plans_2,OneHotEncoder_other_payment_plans_3,OneHotEncoder_existing_credits,...,credit_amount_-_duration,credit_amount_+_duration,credit_amount_/_num_dependents,credit_amount_*_num_dependents,credit_amount_-_num_dependents,credit_amount_+_num_dependents,duration_/_num_dependents,duration_*_num_dependents,duration_-_num_dependents,duration_+_num_dependents
0,3.297082,1.199912,2.406187,-0.409736,0.031196,1.044509,0.468521,-0.397168,-0.213896,1.017777,...,1.189509,1.21025,-0.320701,-0.320701,1.199968,1.199857,-0.344593,-0.344593,3.307063,3.284183
1,-0.008051,-0.35963,-0.224364,-0.409736,-0.860109,-1.67144,0.468521,-0.397168,-0.213896,-0.710931,...,-0.360515,-0.358745,-0.320701,-0.320701,-0.359582,-0.359678,-0.344593,-0.344593,0.004129,-0.020229
2,-1.279256,-0.733547,1.266282,-0.409736,-0.860109,1.044509,0.468521,-0.397168,-0.213896,-0.710931,...,-0.730195,-0.736875,-0.320701,-0.320701,-0.733501,-0.733593,-0.344593,-0.344593,-1.266231,-1.291156
3,-0.008051,0.56705,-0.575104,-0.409736,-1.751413,1.044509,-2.134375,2.517826,-0.213896,1.017777,...,0.56853,0.565571,-0.320701,-0.320701,0.567102,0.566997,-0.344593,-0.344593,0.004129,-0.020229
4,-0.770774,-0.854388,-1.276585,-0.409736,0.9225,-0.766124,0.468521,-0.397168,-0.213896,-0.710931,...,-0.853422,-0.855341,-0.320701,-0.320701,-0.854343,-0.854434,-0.344593,-0.344593,-0.758087,-0.782785


If you need to make changes to the data, you can access the DataBunch directly through the model, e.g. `model._data.X_train`, but I do not recommend doing this.

In [21]:
# Fit with the default model parameters and predict
predicts = model.predict()
# ROC AUC of the first entry of predicts['predict_test'] against the held-out labels
print('Test AUC: ', round(sklearn.metrics.roc_auc_score(y_test, predicts['predict_test'][0]),4))

100%|██████████| 1/1 [01:12<00:00, 72.92s/it]
 Mean Score roc_auc_score on 20 Folds: 0.7449 std: 0.060826
Test AUC:  0.832



**Encoder selection is an important part of all AutoML**    
Even with such smart preprocessing, don't forget the basic DS rule: **Garbage in the input is garbage in the output.**

[RUS] Даже с такой умной предобработкой не забывайте: Мусор на входе - мусор на выходе.