In [1]:
# If you run this notebook on Google Colaboratory, uncomment the below to install automl_alex.
#!pip install -U -q automl-alex

In [15]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml

#import automl_alex
#from automl_alex import LightGBMClassifier, DataBunch

#print(automl_alex.__version__)

In [16]:
RANDOM_SEED = 42

# Load Data

In [17]:
dataset = fetch_openml(name='adult', version=1, as_frame=True)
# convert target to binary
dataset.target = dataset.target.astype('category').cat.codes
dataset.data.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country
0,2,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,1,0,2,United-States
1,3,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,0,United-States
2,2,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,2,United-States
3,3,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,2,United-States
4,1,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,2,Cuba


In [18]:
X_train, X_test, y_train, y_test = train_test_split(dataset.data, 
                                                    dataset.target,
                                                    test_size=0.2, 
                                                    random_state=RANDOM_SEED,)
X_train.shape, X_test.shape

((39073, 14), (9769, 14))

# Data Cleaning (DataBunch)

In [19]:
import pandas as pd
import numpy as np
from itertools import combinations
from sklearn.preprocessing import StandardScaler

In [20]:

from category_encoders import HashingEncoder, SumEncoder, PolynomialEncoder, BackwardDifferenceEncoder 
from category_encoders import OneHotEncoder, HelmertEncoder, OrdinalEncoder, BaseNEncoder
from category_encoders import TargetEncoder, CatBoostEncoder, WOEEncoder, JamesSteinEncoder
from category_encoders.count import CountEncoder

################################################################
            #               Simple Encoders 
            #      (do not use information about target)
################################################################

cat_encoders_names = {
                'HashingEncoder': HashingEncoder,
                'SumEncoder': SumEncoder,
                'BackwardDifferenceEncoder': BackwardDifferenceEncoder,
                'OneHotEncoder': OneHotEncoder,
                'HelmertEncoder': HelmertEncoder,
                'OrdinalEncoder': OrdinalEncoder,
                'BaseNEncoder': BaseNEncoder,
                'CountEncoder': CountEncoder,
                }



################################################################
            #                Target Encoders
################################################################

target_encoders_names = {
                'TargetEncoder': TargetEncoder,
                'CatBoostEncoder': CatBoostEncoder,
                'WOEEncoder': WOEEncoder,
                'JamesSteinEncoder': JamesSteinEncoder,
                }

In [28]:
import pandas as pd
import numpy as np
from itertools import combinations
import pickle


# disable chained assignments
pd.options.mode.chained_assignment = None 


class CleanNans(object):
    """
    Сlass for cleaning Nans
    """

    def __init__(self, method='median'):
        """
        Fill Nans and add column, that there were nans in this column
        
        Args:
            method : {'median', 'mean',}
        """
        self.method = method

    def fit(self, data, cols=None):
        """
        Fit fillna.

        Args:
            data (pd.DataFrame, shape (n_samples, n_features)): the input data
            cols list() features: the input data
        Returns:
            self
        """
        if cols is not None:
            data = data[cols]
        
        data = data._get_numeric_data()
        
        self.nan_columns = list(data.columns[data.isnull().sum() > 0])
        if not self.nan_columns:     
            print('No nans features')

        if self.method is 'median':
            self.fill_value = data.median()
        elif self.method is 'mean':
            self.fill_value = data.mean()
        else:
            raise ValueError('Wrong fill method')

        return self

    def transform(self, data, cols=None):
        """Transforms the dataset.
        Args:
            data (pd.DataFrame, shape (n_samples, n_features)): the input data
            cols list() features: the input data
        Returns:
            pandas.Dataframe of shape = (n_train, n_features)
                The train dataset with no missing values.
        """
        if cols is not None:
            data = data[cols]

        if self.nan_columns:
            for nan_column in self.nan_columns:
                data[nan_column+'_isNAN'] = pd.isna(data[nan_column]).astype('uint8')
            
            data.fillna(self.fill_value, inplace=True)
        else:
            raise ValueError('No nans features')

        return data

    def fit_transform(self, data, cols=None):
        """Fit and transforms the dataset.
        Args:
            data (pd.DataFrame, shape (n_samples, n_features)): the input data
            cols list() features: the input data
        Returns:
            pandas.Dataframe of shape = (n_train, n_features)
                The train dataset with no missing values.
        """
        self.fit(data, cols)

        return self.transform(data)

class DataPrepare(object):
    """
    Сlass for cleaning, encoding and processing your dataset
    """
    def __init__(self, 
                cat_features=None,
                clean_and_encod_data=True,
                cat_encoder_names=['HelmertEncoder','CountEncoder'],
                clean_nan=True,
                num_generator_features=True,
                #group_generator_features=False,
                #frequency_enc_num_features=False,
                random_state=42,
                verbose=1):
        """
        Description of __init__

        Args:
            cat_features=None (list or None): 
            clean_and_encod_data=True (undefined):
            cat_encoder_names=None (list or None):
            clean_nan=True (undefined):
            num_generator_features=True (undefined):
            random_state=42 (undefined):
        """
        self.random_state = random_state
        self.cat_encoder_names = cat_encoder_names
        self.verbose = verbose
        self._clean_and_encod_data = clean_and_encod_data
        self._clean_nan = clean_nan
        self._num_generator_features = num_generator_features
        self.cat_features = cat_features

        self.binary_encoder = None
        self.clean_nan_encoder = None
        self.cat_clean_ord_encoder = None

        self.fit_cat_encoders={}

    def check_data_format(self, data):
        """
        Description of check_data_format:
            Check that data is not pd.DataFrame or empty

        Args:
            data (pd.DataFrame, shape (n_samples, n_features)): the input data
        Return:
            True or Exception
        """
        if (not isinstance(data, pd.DataFrame)) or data.empty:
            raise Exception("data is not pd.DataFrame or empty")

    def check_num_nans(self, data):
        """
        Description of check_num_nans:
            Check Nans in numeric features in data 

        Args:
            data (pd.DataFrame, shape (n_samples, n_features)): the input data
        Return:
            True or Exception
        """
        data = data._get_numeric_data()
        return(len(list(data.columns[data.isnull().sum() > 0])) > 0)

    def auto_detect_cat_features(self, data):
        """
        Description of _auto_detect_cat_features:
            Auto-detection categorical_features by simple rule:
            categorical feature == if feature nunique low 1% of data

        Args:
            data (pd.DataFrame): dataset
            
        Returns:
            cat_features (list): columns names cat features
        
        """
        #object_features = list(data.columns[data.dtypes == 'object'])
        cat_features = data.columns[(data.nunique(dropna=False) < len(data)//100) & \
            (data.nunique(dropna=False) >2)]
        if len(cat_features) < 1:
            cat_features = None
        #cat_features = list(set([*object_features, *cat_features]))
        return(cat_features)
    
    def gen_numeric_interaction_features(self, 
                                        df, 
                                        columns, 
                                        operations=['/','*','-','+'],) -> pd.DataFrame:
        """
        Description of numeric_interaction_terms:
            Numerical interaction generator features: A/B, A*B, A-B,

        Args:
            df (pd.DataFrame):
            columns (list): num columns names
            operations (list): operations type

        Returns:
            pd.DataFrame

        """
        fe_df = pd.DataFrame()
        for c in combinations(columns,2):
            if '/' in operations:
                fe_df['{}_/_{}'.format(c[0], c[1]) ] = (df[c[0]]*1.) / df[c[1]]
            if '*' in operations:
                fe_df['{}_*_{}'.format(c[0], c[1]) ] = df[c[0]] * df[c[1]]
            if '-' in operations:
                fe_df['{}_-_{}'.format(c[0], c[1]) ] = df[c[0]] - df[c[1]]
            if '+' in operations:
                fe_df['{}_+_{}'.format(c[0], c[1]) ] = df[c[0]] + df[c[1]]
        return(fe_df)

    def fit(self, data,):
        """
        Fit DataPrepare.

        Args:
            data (pd.DataFrame, shape (n_samples, n_features)): 
                the input data
        Returns:
            self
        """
        ########### check_data_format ######################
        self.check_data_format(data)

        if self.verbose > 0:   
            print('Source data shape: ', data.shape,)
            print('#'*50)
            print('! START FIT preprocessing Data')

        data = data.reset_index(drop=True)
        ########### Detect type of features ######################

        if self.cat_features is None:
            self.cat_features = self.auto_detect_cat_features(data)
            if self.verbose > 0:
                print('- Auto detect cat features: ', len(self.cat_features))

        self.binary_features = data.columns[data.nunique(dropna=False) <= 2]
        self.num_features = list(set(data.select_dtypes('number').columns) - set(self.binary_features))


        ########### Binary Features ######################
        if len(self.binary_features) > 0:
            if self.verbose > 0:
                    print('> Binary Features')

            self.binary_encoder = OrdinalEncoder()
            self.binary_encoder = self.binary_encoder.fit(data[self.binary_features])

        ########### Categorical Features ######################
        if self.cat_features is not None:
            # Clean Categorical Features
            if self.verbose > 0:
                    print('> Clean Categorical Features')
            self.cat_clean_ord_encoder = OrdinalEncoder()
            self.cat_clean_ord_encoder = self.cat_clean_ord_encoder.fit(data[self.cat_features])
            data[self.cat_features] = self.cat_clean_ord_encoder.transform(data[self.cat_features])


            # Encode Categorical Features
            if self.verbose > 0:
                    print('> Encode Categorical Features.')

            for cat_encoder_name in self.cat_encoder_names:
                if self.verbose > 0:
                    print(' +', cat_encoder_name)

                if cat_encoder_name not in cat_encoders_names.keys():
                    raise Exception(f"{cat_encoder_name} not support!")

                self.fit_cat_encoders[cat_encoder_name] = cat_encoders_names[cat_encoder_name](cols=self.cat_features, drop_invariant=True)
                if cat_encoder_name == 'HashingEncoder':
                    self.fit_cat_encoders[cat_encoder_name] = cat_encoders_names[cat_encoder_name](
                            n_components=int(np.log(len(data.columns))*1000), 
                            drop_invariant=True)
                
                self.fit_cat_encoders[cat_encoder_name] = \
                    self.fit_cat_encoders[cat_encoder_name].fit(data[self.cat_features])

        ########### Numerical Features ######################

        # CleanNans
        if self._clean_nan:
            if self.check_num_nans(data):
                self.clean_nan_encoder = CleanNans()
                self.clean_nan_encoder = self.clean_nan_encoder.fit(data[self.num_features])
                if self.verbose:
                    print('> CleanNans, total nans columns:', \
                        len(self.clean_nan_encoder.nan_columns))
            else:
                if self.verbose:
                    print('  No nans features')

        ########### Final ######################
        if self.verbose:
            print('#'*50)
            print('! END FIT preprocessing Data')
        return self

    def transform(self, data) -> pd.DataFrame:
        """Transform dataset.
        Args:
            data (pd.DataFrame, shape = (n_samples, n_features)): 
                the input data
        Returns:
            data (pd.Dataframe, shape = (n_train, n_features)):
                The dataset with clean numerical and encoded categorical features.
        """
        if self.verbose > 0:
            start_columns = len(data.columns)
            print('#'*50)
            print('! Start Transform Data')

        data = data.reset_index(drop=True)

        ########### Binary Features ######################
        
        if self.binary_encoder:
            data[self.binary_features] = self.binary_encoder.transform(data[self.binary_features]).replace(2,0).astype('category')
            if self.verbose:
                print('> Clean Binary Features')

        ########### Categorical Features ######################
        if self.cat_features is not None:
            # Clean Categorical Features
            if self.verbose > 0:
                print('> Clean Categorical Features')
            data[self.cat_features] = self.cat_clean_ord_encoder.transform(data[self.cat_features])

            # Encode Categorical Features
            if self.verbose > 0:
                print('> Transform Categorical Features.')
            for cat_encoder_name in self.cat_encoder_names:
                data_encodet = self.fit_cat_encoders[cat_encoder_name].transform(data[self.cat_features])
                data_encodet = data_encodet.add_prefix(cat_encoder_name + '_')
                if self.verbose > 0:
                    print(' - Encoder:', cat_encoder_name, 'ADD features:', len(data_encodet.columns))
                data = data.join(data_encodet.reset_index(drop=True))
        

        ########### Numerical Features ######################
        # CleanNans
        if self.clean_nan_encoder:
            data = self.clean_nan_encoder.transform(data)
            if self.verbose:
                print('> Clean Nans')

        # Generator interaction Num Features
        if self._num_generator_features:
            if len(self.num_features) > 1:
                if self.verbose > 0:
                    print('> Generate interaction Num Features')
                fe_df = self.gen_numeric_interaction_features(data[self.num_features], 
                                                            self.num_features,
                                                            operations=['/','*','-','+'],)
                data = data.join(fe_df.reset_index(drop=True))
                if self.verbose > 0:
                    print(' ADD features:', fe_df.shape[1],)
        
        data.replace([np.inf, -np.inf], np.nan, inplace=True)
        data.fillna(0, inplace=True)


        ########### Final ######################
        if self.verbose > 0:
            end_columns = len(data.columns)
            print('#'*50)
            print('Final data shape: ', data.shape,)
            print('Total ADD columns:', end_columns-start_columns)
        return data

    def fit_transform(self, data,) -> pd.DataFrame:
        """Fits and transforms the dataset.
        Args:
            data (pd.DataFrame, shape = (n_samples, n_features)): 
                the input data
        Returns:
            data (pd.Dataframe, shape = (n_train, n_features)):
                The dataset with clean numerical and encoded categorical features.
        """
        self.fit(data)

        return self.transform(data)

    def save(self, name):
        pickle.dump(self, open(name+'.pkl', 'wb'), protocol=4)

    def load(self, name):
        return(pickle.load(open(name+'.pkl', 'rb')))

In [22]:
X_train, X_test, y_train, y_test = train_test_split(dataset.data, 
                                                    dataset.target,
                                                    test_size=0.2, 
                                                    random_state=RANDOM_SEED,)
X_train.shape, X_test.shape

((39073, 14), (9769, 14))

In [30]:
de = DataPrepare()
de = de.fit(X_train)

In [24]:
X_train = de.transform(X_train)

##################################################
! Start Transform Data
> Clean Binary Features
> Clean Categorical Features
> Transform Categorical Features.
 - Encoder: HelmertEncoder ADD features: 123
 - Encoder: CountEncoder ADD features: 12
> Generate interaction Num Features
 ADD features: 4
##################################################
Final data shape:  (39073, 153)
Total ADD columns: 139


In [25]:
de.save('de')

In [31]:
de = de.load('de')

In [32]:
de.transform(X_test)

##################################################
! Start Transform Data
> Clean Binary Features
> Clean Categorical Features
> Transform Categorical Features.
 - Encoder: HelmertEncoder ADD features: 123
 - Encoder: CountEncoder ADD features: 12
> Generate interaction Num Features
 ADD features: 4
##################################################
Final data shape:  (9769, 153)
Total ADD columns: 139


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,...,CountEncoder_relationship,CountEncoder_race,CountEncoder_capitalgain,CountEncoder_capitalloss,CountEncoder_hoursperweek,CountEncoder_native-country,fnlwgt_/_education-num,fnlwgt_*_education-num,fnlwgt_-_education-num,fnlwgt_+_education-num
0,5,1,423024.0,1,9.0,4,6,3,1,1,...,10067,33425,35788,37253,3582,35123,47002.666667,3807216.0,423015.0,423033.0
1,5,1,178953.0,3,8.0,4,5,5,1,0,...,6122,33425,35788,37253,3582,35123,22369.125000,1431624.0,178945.0,178961.0
2,5,3,348986.0,1,9.0,4,10,4,2,1,...,1180,3734,35788,37253,22283,35123,38776.222222,3140874.0,348977.0,348995.0
3,5,1,218215.0,6,10.0,4,5,5,1,0,...,6122,33425,35788,37253,4699,35123,21821.500000,2182150.0,218205.0,218225.0
4,4,1,244025.0,1,9.0,4,11,2,5,1,...,4059,374,35788,37253,7170,152,27113.888889,2196225.0,244016.0,244034.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9764,3,4,118614.0,2,14.0,5,5,2,1,0,...,4059,33425,35788,37253,22283,35123,8472.428571,1660596.0,118600.0,118628.0
9765,5,1,205838.0,1,9.0,4,6,5,1,1,...,6122,33425,35788,37253,22283,35123,22870.888889,1852542.0,205829.0,205847.0
9766,2,1,194304.0,6,10.0,2,1,3,2,1,...,10067,3734,35788,37253,7170,35123,19430.400000,1943040.0,194294.0,194314.0
9767,4,4,245724.0,6,10.0,2,8,3,1,1,...,10067,33425,35788,37253,7170,35123,24572.400000,2457240.0,245714.0,245734.0


In [58]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39073 entries, 0 to 39072
Columns: 149 entries, age to CountEncoder_native-country
dtypes: category(1), float64(126), int64(22)
memory usage: 44.2 MB


As we can see, the data is quite dirty, there are object/category features and nans. But the **model is successfully trained even in such a dirty dataset**

[RUS] Как мы видим, данные довольно грязные, есть object/category признаки и nans. Но модель успешно обучаеться даже таком грязном датасете:

In [7]:
model = LightGBMClassifier(X_train, y_train, X_test, random_state=RANDOM_SEED)

In [8]:
# fit with default model parameters
predicts = model.predict()
print('Test AUC: ', round(sklearn.metrics.roc_auc_score(y_test, predicts['predict_test'][0]),4))

100%|██████████| 1/1 [02:13<00:00, 133.73s/it]
 Mean Score roc_auc_score on 20 Folds: 0.9142 std: 0.00485
Test AUC:  0.9119



**How is this possible?**      
[RUS] как это возможно?

<img src="./img/magic.gif" width="400">

## DataBunch
before entering the model, the data goes through a full cycle of pre-processing in DataBunch     
[RUS] до того как попасть в модель, данные проходят полный цикл предобработки.

In [9]:
data = DataBunch(X_train=X_train, 
                y_train=y_train,
                X_test=X_test, # be sure to specify X_test, because the encoder needs all dataset to work.
                clean_and_encod_data=True,
                cat_encoder_names=['OneHotEncoder',], # Encoders list for Generator cat encodet features
                clean_nan=True, # fillnan
                num_generator_features=False, # Generator interaction Num Features
                group_generator_features=False, # Generator Group Encoder Features
                frequency_enc_num_features=False,
                normalization=False,
                cat_features=None, # DataBunch can auto detect categorical features
                random_state=RANDOM_SEED)

Source X_train shape:  (39073, 14) | X_test shape:  (9769, 14)
##################################################
Auto detect cat features:  12
> Start preprocessing Data
> Generate cat encodet features
 +  121  Features from  OneHotEncoder
> Clean Nans in num features
##################################################
> Total Features:  122
##################################################
New X_train shape:  (39073, 122) | X_test shape:  (9769, 122)


In [10]:
data.X_train.head(5)

Unnamed: 0,fnlwgt,OneHotEncoder_relationship_1,OneHotEncoder_relationship_2,OneHotEncoder_relationship_3,OneHotEncoder_relationship_4,OneHotEncoder_relationship_5,OneHotEncoder_relationship_6,OneHotEncoder_race_1,OneHotEncoder_race_2,OneHotEncoder_race_3,...,OneHotEncoder_education_14,OneHotEncoder_education_15,OneHotEncoder_education_16,OneHotEncoder_marital-status_1,OneHotEncoder_marital-status_2,OneHotEncoder_marital-status_3,OneHotEncoder_marital-status_4,OneHotEncoder_marital-status_5,OneHotEncoder_marital-status_6,OneHotEncoder_marital-status_7
0,50753.0,1,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
1,144351.0,1,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
2,252217.0,1,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
3,69525.0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
4,28612.0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0


# Feature Engineering

## Categorical Features 
### Encoders

In [11]:
# available Encoders:
automl_alex.encoders.cat_encoders_names

{'HashingEncoder': category_encoders.hashing.HashingEncoder,
 'SumEncoder': category_encoders.sum_coding.SumEncoder,
 'PolynomialEncoder': category_encoders.polynomial.PolynomialEncoder,
 'BackwardDifferenceEncoder': category_encoders.backward_difference.BackwardDifferenceEncoder,
 'OneHotEncoder': category_encoders.one_hot.OneHotEncoder,
 'HelmertEncoder': category_encoders.helmert.HelmertEncoder,
 'OrdinalEncoder': category_encoders.ordinal.OrdinalEncoder,
 'FrequencyEncoder': automl_alex.encoders.FrequencyEncoder,
 'BaseNEncoder': category_encoders.basen.BaseNEncoder}

In [12]:
data = DataBunch(X_train=X_train, 
                y_train=y_train,
                X_test=X_test, # be sure to specify X_test, because the encoder needs all dataset to work.
                clean_and_encod_data=True,
                cat_encoder_names=['OrdinalEncoder', 'FrequencyEncoder',], # you can choose any encoders
                clean_nan=True, # fillnan
                #cat_features=categorical_features, # DataBunch can detect categorical features itself.
                num_generator_features=False, # Generator interaction Num Features
                group_generator_features=False, # Generator Group Encoder Features
                frequency_enc_num_features=False,
                normalization=False,
                random_state=RANDOM_SEED)
data.X_train.head(5)

Source X_train shape:  (39073, 14) | X_test shape:  (9769, 14)
##################################################
Auto detect cat features:  12
> Start preprocessing Data
> Generate cat encodet features
 +  13  Features from  OrdinalEncoder
 +  13  Features from  FrequencyEncoder
> Clean Nans in num features
##################################################
> Total Features:  27
##################################################
New X_train shape:  (39073, 27) | X_test shape:  (9769, 27)


Unnamed: 0,fnlwgt,OrdinalEncoder_relationship,OrdinalEncoder_race,OrdinalEncoder_hoursperweek,OrdinalEncoder_workclass,OrdinalEncoder_occupation,OrdinalEncoder_capitalloss,OrdinalEncoder_native-country,OrdinalEncoder_sex,OrdinalEncoder_age,...,FrequencyEncoder_workclass,FrequencyEncoder_occupation,FrequencyEncoder_capitalloss,FrequencyEncoder_native-country,FrequencyEncoder_sex,FrequencyEncoder_age,FrequencyEncoder_education-num,FrequencyEncoder_capitalgain,FrequencyEncoder_education,FrequencyEncoder_marital-status
0,50753.0,1,1,1,1,1,1,1,1,1,...,0.694198,0.048217,0.953278,0.897424,0.668482,0.260411,0.323164,0.917387,0.323164,0.458192
1,144351.0,1,1,1,2,2,1,1,1,2,...,0.040559,0.126367,0.953278,0.897424,0.668482,0.244707,0.0544,0.917387,0.0544,0.458192
2,252217.0,1,1,1,3,3,1,1,1,1,...,0.064207,0.030507,0.953278,0.897424,0.668482,0.260411,0.013452,0.917387,0.013452,0.458192
3,69525.0,2,1,2,1,4,1,1,1,3,...,0.694198,0.125138,0.953278,0.897424,0.668482,0.127923,0.323164,0.917387,0.323164,0.135805
4,28612.0,3,1,3,4,5,1,1,1,3,...,0.079071,0.11269,0.953278,0.897424,0.668482,0.127923,0.323164,0.917387,0.323164,0.03108


### Encoding cat features by Groupby with numerical features

In [13]:
# add dataset whis more num features
dataset = fetch_openml(name='credit-g', version=1, as_frame=True)
dataset.target = dataset.target.astype('category').cat.codes

X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.2, random_state=RANDOM_SEED,)

In [14]:
data = DataBunch(X_train=X_train, 
                y_train=y_train,
                X_test=X_test, # be sure to specify X_test, because the encoder needs all dataset to work.
                clean_and_encod_data=True,
                cat_encoder_names=None, # False if None
                clean_nan=True, # fillnan
                #cat_features=categorical_features, # DataBunch can detect categorical features itself.
                num_generator_features=False, # Generator interaction Num Features
                group_generator_features=True, # Generator Group Encoder Features
                frequency_enc_num_features=False,
                normalization=False,
                random_state=RANDOM_SEED)
data.X_train.head(5)

Source X_train shape:  (800, 20) | X_test shape:  (200, 20)
##################################################
Auto detect cat features:  13
> Start preprocessing Data
> Clean Nans in num features
> Generate Group Encoder Features
 +  64  Group cat Encoder Features
##################################################
> Total Features:  68
##################################################
New X_train shape:  (800, 68) | X_test shape:  (200, 68)


Unnamed: 0,duration,credit_amount,age,num_dependents,GroupEncoder_age_installment_commitment,GroupEncoder_age_residence_since,GroupEncoder_age_other_payment_plans,GroupEncoder_age_existing_credits,GroupEncoder_age_foreign_worker,GroupEncoder_age_housing,...,GroupEncoder_num_dependents_savings_status,GroupEncoder_num_dependents_credit_history,GroupEncoder_num_dependents_job,GroupEncoder_num_dependents_property_magnitude,GroupEncoder_num_dependents_employment,GroupEncoder_num_dependents_checking_status,GroupEncoder_num_dependents_purpose,GroupEncoder_num_dependents_other_parties,GroupEncoder_num_dependents_personal_status,GroupEncoder_num_dependents_own_telephone
0,60.0,6836.0,63.0,0,2,3,2,1,0,1,...,0,3,2,3,4,0,9,0,2,1
1,21.0,2319.0,33.0,0,1,0,2,0,0,0,...,0,4,2,2,1,2,6,0,0,0
2,6.0,1236.0,50.0,0,1,3,2,0,0,0,...,2,2,2,1,2,3,1,0,2,0
3,21.0,5003.0,29.0,0,0,3,0,1,0,1,...,4,0,2,1,2,3,0,0,1,1
4,12.0,886.0,21.0,0,3,1,2,0,0,1,...,4,2,2,2,2,3,3,0,1,0


## Numerical Features

In [15]:
data.num_features_names

['age', 'credit_amount', 'duration', 'num_dependents']

### Generator interaction Num Features
Numerical interaction generator features: A/B, A*B, A-B, A+B

In [16]:
data = DataBunch(X_train=X_train, 
                y_train=y_train,
                X_test=X_test, # be sure to specify X_test, because the encoder needs all dataset to work.
                clean_and_encod_data=True,
                cat_encoder_names=None, # False if None
                clean_nan=True, # fillnan
                #cat_features=categorical_features, # DataBunch can detect categorical features itself.
                num_generator_features=True, # Generator interaction Num Features
                group_generator_features=False, # Generator Group Encoder Features
                frequency_enc_num_features=False,
                normalization=False,
                random_state=RANDOM_SEED)
data.X_train.head(5)

Source X_train shape:  (800, 20) | X_test shape:  (200, 20)
##################################################
Auto detect cat features:  13
> Start preprocessing Data
> Clean Nans in num features
> Generate interaction Num Features
 +  24  Interaction Features
##################################################
> Total Features:  28
##################################################
New X_train shape:  (800, 28) | X_test shape:  (200, 28)


Unnamed: 0,duration,credit_amount,age,num_dependents,age_/_credit_amount,age_*_credit_amount,age_-_credit_amount,age_+_credit_amount,age_/_duration,age_*_duration,...,credit_amount_-_duration,credit_amount_+_duration,credit_amount_/_num_dependents,credit_amount_*_num_dependents,credit_amount_-_num_dependents,credit_amount_+_num_dependents,duration_/_num_dependents,duration_*_num_dependents,duration_-_num_dependents,duration_+_num_dependents
0,60.0,6836.0,63.0,0,0.009216,430668.0,-6773.0,6899.0,1.05,3780.0,...,6776.0,6896.0,0.0,0.0,6836.0,6836.0,0.0,0.0,60.0,60.0
1,21.0,2319.0,33.0,0,0.01423,76527.0,-2286.0,2352.0,1.571429,693.0,...,2298.0,2340.0,0.0,0.0,2319.0,2319.0,0.0,0.0,21.0,21.0
2,6.0,1236.0,50.0,0,0.040453,61800.0,-1186.0,1286.0,8.333333,300.0,...,1230.0,1242.0,0.0,0.0,1236.0,1236.0,0.0,0.0,6.0,6.0
3,21.0,5003.0,29.0,0,0.005797,145087.0,-4974.0,5032.0,1.380952,609.0,...,4982.0,5024.0,0.0,0.0,5003.0,5003.0,0.0,0.0,21.0,21.0
4,12.0,886.0,21.0,0,0.023702,18606.0,-865.0,907.0,1.75,252.0,...,874.0,898.0,0.0,0.0,886.0,886.0,0.0,0.0,12.0,12.0


### Frequency Encoder Numerical Features

In [17]:
data = DataBunch(X_train=X_train, 
                y_train=y_train,
                X_test=X_test, # be sure to specify X_test, because the encoder needs all dataset to work.
                clean_and_encod_data=True,
                cat_encoder_names=None, # False if None
                clean_nan=True, # fillnan
                #cat_features=categorical_features, # DataBunch can detect categorical features itself.
                num_generator_features=False, # Generator interaction Num Features
                group_generator_features=False, # Generator Group Encoder Features
                frequency_enc_num_features=True, 
                normalization=False,
                random_state=RANDOM_SEED)
data.X_train.head(5)

Source X_train shape:  (800, 20) | X_test shape:  (200, 20)
##################################################
Auto detect cat features:  13
> Start preprocessing Data
> Generate Frequency Encode num features
 +  4  Frequency Encode Num Features 
> Clean Nans in num features
##################################################
> Total Features:  8
##################################################
New X_train shape:  (800, 8) | X_test shape:  (200, 8)


Unnamed: 0,duration,credit_amount,age,num_dependents,FrequencyEncoder_age,FrequencyEncoder_credit_amount,FrequencyEncoder_duration,FrequencyEncoder_num_dependents
0,60.0,6836.0,63.0,0,0.008,0.001,0.013,0.845
1,21.0,2319.0,33.0,0,0.033,0.001,0.03,0.845
2,6.0,1236.0,50.0,0,0.012,0.002,0.075,0.845
3,21.0,5003.0,29.0,0,0.037,0.001,0.03,0.845
4,12.0,886.0,21.0,0,0.014,0.001,0.179,0.845


## Normalization Data
use StandardScaler()

In [18]:
data = DataBunch(X_train=X_train, 
                y_train=y_train,
                X_test=X_test, # be sure to specify X_test, because the encoder needs all dataset to work.
                clean_and_encod_data=True,
                cat_encoder_names=None, # False if None
                clean_nan=True, # fillnan
                #cat_features=categorical_features, # DataBunch can detect categorical features itself.
                num_generator_features=False, # Generator interaction Num Features
                group_generator_features=False, # Generator Group Encoder Features
                frequency_enc_num_features=True, 
                normalization=True,
                random_state=RANDOM_SEED)
data.X_train.head(5)

Source X_train shape:  (800, 20) | X_test shape:  (200, 20)
##################################################
Auto detect cat features:  13
> Start preprocessing Data
> Generate Frequency Encode num features
 +  4  Frequency Encode Num Features 
> Clean Nans in num features
> Normalization Features
##################################################
> Total Features:  8
##################################################
New X_train shape:  (800, 8) | X_test shape:  (200, 8)


Unnamed: 0,duration,credit_amount,age,num_dependents,FrequencyEncoder_age,FrequencyEncoder_credit_amount,FrequencyEncoder_duration,FrequencyEncoder_num_dependents
0,3.297082,1.199912,2.406187,-0.409736,-1.603822,-0.403815,-1.40662,0.409736
1,-0.008051,-0.35963,-0.224364,-0.409736,0.166108,-0.403815,-1.14286,0.409736
2,-1.279256,-0.733547,1.266282,-0.409736,-1.320634,2.062233,-0.44467,0.409736
3,-0.008051,0.56705,-0.575104,-0.409736,0.449297,-0.403815,-1.14286,0.409736
4,-0.770774,-0.854388,-1.276585,-0.409736,-1.179039,-0.403815,1.168925,0.409736


# Model DataBunch

In [19]:
# After you can pass databunch in model
model = LightGBMClassifier(databunch=data, random_state=RANDOM_SEED)
model._data.X_train.head(5)

Unnamed: 0,duration,credit_amount,age,num_dependents,FrequencyEncoder_age,FrequencyEncoder_credit_amount,FrequencyEncoder_duration,FrequencyEncoder_num_dependents
0,3.297082,1.199912,2.406187,-0.409736,-1.603822,-0.403815,-1.40662,0.409736
1,-0.008051,-0.35963,-0.224364,-0.409736,0.166108,-0.403815,-1.14286,0.409736
2,-1.279256,-0.733547,1.266282,-0.409736,-1.320634,2.062233,-0.44467,0.409736
3,-0.008051,0.56705,-0.575104,-0.409736,0.449297,-0.403815,-1.14286,0.409736
4,-0.770774,-0.854388,-1.276585,-0.409736,-1.179039,-0.403815,1.168925,0.409736


In [20]:
# or you can specify all DataBunch settings in model
model = LightGBMClassifier(
    X_train, 
    y_train, 
    X_test,
    cat_features=None,
    clean_and_encod_data=True,
    cat_encoder_names=['OneHotEncoder', 'HelmertEncoder', 'HashingEncoder', 'FrequencyEncoder'],
    num_generator_features=True, # Generator interaction Num Features
    group_generator_features=False, # Generator Group Encoder Features
    frequency_enc_num_features=True, 
    normalization=True,
    clean_nan=True, # fillnan
    verbose=1,
    random_state=RANDOM_SEED,
    )
model._data.X_train.head(5)

Source X_train shape:  (800, 20) | X_test shape:  (200, 20)
##################################################
Auto detect cat features:  13
> Start preprocessing Data
> Generate cat encodet features
 +  55  Features from  OneHotEncoder
 +  44  Features from  HelmertEncoder
 +  54  Features from  HashingEncoder
 +  16  Features from  FrequencyEncoder
> Generate Frequency Encode num features
 +  4  Frequency Encode Num Features 
> Clean Nans in num features
> Generate interaction Num Features
 +  24  Interaction Features
> Normalization Features
##################################################
> Total Features:  201
##################################################
New X_train shape:  (800, 201) | X_test shape:  (200, 201)


Unnamed: 0,duration,credit_amount,age,num_dependents,OneHotEncoder_installment_commitment,OneHotEncoder_residence_since,OneHotEncoder_other_payment_plans_1,OneHotEncoder_other_payment_plans_2,OneHotEncoder_other_payment_plans_3,OneHotEncoder_existing_credits,...,credit_amount_-_duration,credit_amount_+_duration,credit_amount_/_num_dependents,credit_amount_*_num_dependents,credit_amount_-_num_dependents,credit_amount_+_num_dependents,duration_/_num_dependents,duration_*_num_dependents,duration_-_num_dependents,duration_+_num_dependents
0,3.297082,1.199912,2.406187,-0.409736,0.031196,1.044509,0.468521,-0.397168,-0.213896,1.017777,...,1.189509,1.21025,-0.320701,-0.320701,1.199968,1.199857,-0.344593,-0.344593,3.307063,3.284183
1,-0.008051,-0.35963,-0.224364,-0.409736,-0.860109,-1.67144,0.468521,-0.397168,-0.213896,-0.710931,...,-0.360515,-0.358745,-0.320701,-0.320701,-0.359582,-0.359678,-0.344593,-0.344593,0.004129,-0.020229
2,-1.279256,-0.733547,1.266282,-0.409736,-0.860109,1.044509,0.468521,-0.397168,-0.213896,-0.710931,...,-0.730195,-0.736875,-0.320701,-0.320701,-0.733501,-0.733593,-0.344593,-0.344593,-1.266231,-1.291156
3,-0.008051,0.56705,-0.575104,-0.409736,-1.751413,1.044509,-2.134375,2.517826,-0.213896,1.017777,...,0.56853,0.565571,-0.320701,-0.320701,0.567102,0.566997,-0.344593,-0.344593,0.004129,-0.020229
4,-0.770774,-0.854388,-1.276585,-0.409736,0.9225,-0.766124,0.468521,-0.397168,-0.213896,-0.710931,...,-0.853422,-0.855341,-0.320701,-0.320701,-0.854343,-0.854434,-0.344593,-0.344593,-0.758087,-0.782785


If you need to make changes to the data, you can access the databunch directly in the model model._data.X_train. But I do not recommend doing this.

In [21]:
# fit with default model parameters
predicts = model.predict()
print('Test AUC: ', round(sklearn.metrics.roc_auc_score(y_test, predicts['predict_test'][0]),4))

100%|██████████| 1/1 [01:12<00:00, 72.92s/it]
 Mean Score roc_auc_score on 20 Folds: 0.7449 std: 0.060826
Test AUC:  0.832



**Encoder selection is an important part of all AutoML**    
Even with such smart preprocessing, don't forget the basic DS rule: **Garbage in the input is garbage in the output.**

[RUS] Даже с такой умной предобработкой не забывайте: Мусор на входе - мусор на выходе.