In [1]:
# If you run this notebook on Google Colaboratory, uncomment the line below to install automl_alex.
# (%pip installs into the environment of the running kernel.)
#%pip install automl-alex

In [1]:
# scikit-learn: dataset download, train/test splitting and the AUC metric.
import sklearn
# Import the metrics submodule explicitly: `import sklearn` on its own does not
# guarantee that `sklearn.metrics` (needed for `sklearn.metrics.roc_auc_score`
# in the scoring cells) has been loaded.
import sklearn.metrics
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml

# AutoML Alex: the model wrapper and the preprocessing container used in this notebook.
import automl_alex
from automl_alex import LightGBMClassifier, DataBunch

# Record the library version the notebook was executed with.
print(automl_alex.__version__)

0.07.17


In [2]:
# Single seed shared by the train/test split, DataBunch and the models for reproducibility.
RANDOM_SEED = 42

# Load Data

In [4]:
# Download the 'adult' dataset (version 1) from OpenML as pandas objects.
dataset = fetch_openml(name='adult', version=1, as_frame=True)
# convert the target labels to integer category codes (binary target)
dataset.target = dataset.target.astype('category').cat.codes
# Preview the first five rows of the feature frame.
dataset.data.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country
0,2,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,1,0,2,United-States
1,3,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,0,United-States
2,2,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,2,United-States
3,3,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,2,United-States
4,1,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,2,Cuba


In [5]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data,
    dataset.target,
    test_size=0.2,
    random_state=RANDOM_SEED,
)
# Show the resulting shapes of the train and test feature frames.
X_train.shape, X_test.shape

((39073, 14), (9769, 14))

# Data Cleaning (DataBunch)

In [8]:
# Preview the raw training features before any preprocessing.
X_train.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country
37193,1,Private,50753.0,HS-grad,9.0,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,2,United-States
31093,2,State-gov,144351.0,Masters,14.0,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,2,United-States
33814,1,Local-gov,252217.0,12th,8.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,2,United-States
14500,4,Private,69525.0,HS-grad,9.0,Divorced,Craft-repair,Unmarried,White,Male,0,0,0,United-States
23399,4,Self-emp-not-inc,28612.0,HS-grad,9.0,Widowed,Sales,Not-in-family,White,Male,0,0,4,United-States


In [7]:
# Inspect column dtypes and non-null counts to spot categorical columns and missing values.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39073 entries, 37193 to 15795
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   age             39073 non-null  category
 1   workclass       36851 non-null  category
 2   fnlwgt          39073 non-null  float64 
 3   education       39073 non-null  category
 4   education-num   39073 non-null  float64 
 5   marital-status  39073 non-null  category
 6   occupation      36842 non-null  category
 7   relationship    39073 non-null  category
 8   race            39073 non-null  category
 9   sex             39073 non-null  category
 10  capitalgain     39073 non-null  category
 11  capitalloss     39073 non-null  category
 12  hoursperweek    39073 non-null  category
 13  native-country  38396 non-null  category
dtypes: category(12), float64(2)
memory usage: 1.3 MB


As we can see, the data is quite dirty: there are object/category features and NaNs. Nevertheless, the **model trains successfully even on such a dirty dataset**:

[RUS] Как мы видим, данные довольно грязные, есть object/category признаки и nans. Но модель успешно обучается даже на таком грязном датасете:

In [9]:
# Pass the raw frames straight to the model: no manual cleaning or encoding is done beforehand.
model = LightGBMClassifier(X_train, y_train, X_test, random_state=RANDOM_SEED)

In [10]:
# Train and predict using the model's default parameters.
predicts = model.predict()
# Score element 0 of the returned test predictions against the held-out labels.
test_predictions = predicts['predict_test'][0]
test_auc = sklearn.metrics.roc_auc_score(y_test, test_predictions)
print('Test AUC: ', round(test_auc, 4))

0%|          | 0/1 [00:00<?, ?it/s]
 Mean Score roc_auc_score on 30 Folds: 0.9144 std: 0.004647
100%|██████████| 1/1 [02:45<00:00, 165.67s/it]
Test AUC:0.912


**How is this possible?**      
[RUS] как это возможно?

<img src="./img/magic.gif" width="400">

## DataBunch
Before entering the model, the data goes through a full preprocessing cycle in `DataBunch`.     
[RUS] до того как попасть в модель, данные проходят полный цикл предобработки.

In [11]:
# Build a DataBunch explicitly to make the preprocessing steps visible:
# here the categorical columns are one-hot encoded and NaNs are filled.
data = DataBunch(X_train=X_train, 
                y_train=y_train,
                X_test=X_test, # be sure to specify X_test, because the encoder needs the whole dataset to work.
                clean_and_encod_data=True,
                cat_encoder_names=['OneHotEncoder',], # list of encoders used to generate encoded categorical features
                clean_nan=True, # fill NaN values
                num_generator_features=False, # toggles generation of interaction features between numerical columns
                group_generator_features=False, # toggles generation of group-encoder features
                frequency_enc_num_features=False, # toggles frequency encoding of numerical features
                normalization=False, # toggles feature normalization
                cat_features=None, # DataBunch can auto-detect categorical features
                random_state=RANDOM_SEED)

Source X_train shape:(39073, 14)| X_test shape:(9769, 14)
##################################################
Auto detect cat features:12
> Start preprocessing Data
> Generate cat encodet features
 +121 Features fromOneHotEncoder
> Clean Nans in num features
##################################################
> Total Generated Features:109
##################################################
New X_train shape:(39073, 122)| X_test shape:(9769, 122)


In [12]:
# Preview the training frame after DataBunch preprocessing.
data.X_train.head(5)

Unnamed: 0,fnlwgt,OneHotEncoder_occupation_1,OneHotEncoder_occupation_2,OneHotEncoder_occupation_3,OneHotEncoder_occupation_4,OneHotEncoder_occupation_5,OneHotEncoder_occupation_6,OneHotEncoder_occupation_7,OneHotEncoder_occupation_8,OneHotEncoder_occupation_9,...,OneHotEncoder_age_2,OneHotEncoder_age_3,OneHotEncoder_age_4,OneHotEncoder_age_5,OneHotEncoder_relationship_1,OneHotEncoder_relationship_2,OneHotEncoder_relationship_3,OneHotEncoder_relationship_4,OneHotEncoder_relationship_5,OneHotEncoder_relationship_6
0,50753.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,144351.0,0,1,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
2,252217.0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,69525.0,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0
4,28612.0,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0


# Feature Engineering

## Categorical Features 
### Encoders

In [14]:
# Available encoders (name -> encoder class); the names used for
# `cat_encoder_names` in this notebook all appear as keys here.
automl_alex.encoders.cat_encoders_names

{'HashingEncoder': category_encoders.hashing.HashingEncoder,
 'SumEncoder': category_encoders.sum_coding.SumEncoder,
 'PolynomialEncoder': category_encoders.polynomial.PolynomialEncoder,
 'BackwardDifferenceEncoder': category_encoders.backward_difference.BackwardDifferenceEncoder,
 'OneHotEncoder': category_encoders.one_hot.OneHotEncoder,
 'HelmertEncoder': category_encoders.helmert.HelmertEncoder,
 'OrdinalEncoder': category_encoders.ordinal.OrdinalEncoder,
 'FrequencyEncoder': automl_alex.encoders.FrequencyEncoder}

In [15]:
# Same preprocessing as before, but with two different categorical encoders combined.
data = DataBunch(X_train=X_train, 
                y_train=y_train,
                X_test=X_test, # be sure to specify X_test, because the encoder needs the whole dataset to work.
                clean_and_encod_data=True,
                cat_encoder_names=['OrdinalEncoder', 'FrequencyEncoder',], # you can choose any of the available encoders
                clean_nan=True, # fill NaN values
                #cat_features=categorical_features, # DataBunch can detect categorical features itself.
                num_generator_features=False, # toggles generation of interaction features between numerical columns
                group_generator_features=False, # toggles generation of group-encoder features
                frequency_enc_num_features=False, # toggles frequency encoding of numerical features
                normalization=False, # toggles feature normalization
                random_state=RANDOM_SEED)
# Preview the encoded training frame.
data.X_train.head(5)

Source X_train shape:(39073, 14)| X_test shape:(9769, 14)
##################################################
Auto detect cat features:12
> Start preprocessing Data
> Generate cat encodet features
 +13 Features fromOrdinalEncoder
 +13 Features fromFrequencyEncoder
> Clean Nans in num features
##################################################
> Total Generated Features:15
##################################################
New X_train shape:(39073, 28)| X_test shape:(9769, 28)


Unnamed: 0,fnlwgt,OrdinalEncoder_occupation,OrdinalEncoder_workclass,OrdinalEncoder_sex,OrdinalEncoder_native-country,OrdinalEncoder_education-num,OrdinalEncoder_race,OrdinalEncoder_capitalgain,OrdinalEncoder_education,OrdinalEncoder_marital-status,...,FrequencyEncoder_education-num,FrequencyEncoder_race,FrequencyEncoder_capitalgain,FrequencyEncoder_education,FrequencyEncoder_marital-status,FrequencyEncoder_capitalloss,FrequencyEncoder_hoursperweek,FrequencyEncoder_age,FrequencyEncoder_relationship,FrequencyEncoder_fnlwgt
0,50753.0,1,1,1,1,9.0,1,1,1,1,...,0.323164,0.855043,0.917387,0.323164,0.458192,0.953278,0.569367,0.260411,0.403669,4.1e-05
1,144351.0,2,2,1,1,14.0,1,1,2,1,...,0.0544,0.855043,0.917387,0.0544,0.458192,0.953278,0.569367,0.244707,0.403669,0.000123
2,252217.0,3,3,1,1,8.0,1,1,3,1,...,0.013452,0.855043,0.917387,0.013452,0.458192,0.953278,0.569367,0.260411,0.403669,2e-05
3,69525.0,4,1,1,1,9.0,1,1,1,2,...,0.323164,0.855043,0.917387,0.323164,0.135805,0.953278,0.091172,0.127923,0.10493,2e-05
4,28612.0,5,4,1,1,9.0,1,1,1,3,...,0.323164,0.855043,0.917387,0.323164,0.03108,0.953278,0.034315,0.127923,0.257627,2e-05


### Encoding cat features by Groupby with numerical features

In [3]:
# Switch to a dataset with more numerical features: 'credit-g' from OpenML.
dataset = fetch_openml(name='credit-g', version=1, as_frame=True)
# Encode the class labels as integer category codes.
dataset.target = dataset.target.astype('category').cat.codes

# Seeded 80/20 split; note that this overwrites the earlier
# X_train / X_test / y_train / y_test of the 'adult' dataset.
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data,
    dataset.target,
    test_size=0.2,
    random_state=RANDOM_SEED,
)

In [4]:
# Only the group-encoder feature generator is switched on in this run.
data = DataBunch(X_train=X_train, 
                y_train=y_train,
                X_test=X_test, # be sure to specify X_test, because the encoder needs the whole dataset to work.
                clean_and_encod_data=True,
                cat_encoder_names=None, # None is treated as False: no categorical encoders are applied
                clean_nan=True, # fill NaN values
                #cat_features=categorical_features, # DataBunch can detect categorical features itself.
                num_generator_features=False, # toggles generation of interaction features between numerical columns
                group_generator_features=True, # generate group-encoder features (categorical x numerical)
                frequency_enc_num_features=False, # toggles frequency encoding of numerical features
                normalization=False, # toggles feature normalization
                random_state=RANDOM_SEED)
# Preview the generated GroupEncoder_* columns.
data.X_train.head(5)

Source X_train shape:(800, 20)| X_test shape:(200, 20)
##################################################
Auto detect cat features:13
> Start preprocessing Data
> Clean Nans in num features
> Generate Group Encoder Features
 +64 Group cat Encoder Features
##################################################
> Total Features:68
##################################################
New X_train shape:(800, 68)| X_test shape:(200, 68)


Unnamed: 0,duration,credit_amount,age,num_dependents,GroupEncoder_credit_amount_checking_status,GroupEncoder_credit_amount_housing,GroupEncoder_credit_amount_purpose,GroupEncoder_credit_amount_employment,GroupEncoder_credit_amount_other_payment_plans,GroupEncoder_credit_amount_other_parties,...,GroupEncoder_age_residence_since,GroupEncoder_age_existing_credits,GroupEncoder_age_personal_status,GroupEncoder_age_foreign_worker,GroupEncoder_age_job,GroupEncoder_age_installment_commitment,GroupEncoder_age_own_telephone,GroupEncoder_age_credit_history,GroupEncoder_age_savings_status,GroupEncoder_age_property_magnitude
0,60.0,6836.0,63.0,0,0,1,9,4,2,0,...,3,1,2,0,2,2,1,3,0,3
1,21.0,2319.0,33.0,0,2,0,6,1,2,0,...,0,0,0,0,2,1,0,4,0,2
2,6.0,1236.0,50.0,0,3,0,1,2,2,0,...,3,0,2,0,2,1,0,2,2,1
3,21.0,5003.0,29.0,0,3,1,0,2,0,0,...,3,1,1,0,2,0,1,0,4,1
4,12.0,886.0,21.0,0,3,1,3,2,2,0,...,1,0,1,0,2,3,0,2,4,2


## Numerical Features

In [5]:
# Names of the columns DataBunch holds in its numerical-features list.
data.num_features_names

['credit_amount', 'duration', 'num_dependents', 'age']

### Generator interaction Num Features
Numerical interaction generator features: A/B, A*B, A-B, A+B

In [6]:
# Only the numerical interaction generator (A/B, A*B, A-B, A+B) is switched on in this run.
data = DataBunch(X_train=X_train, 
                y_train=y_train,
                X_test=X_test, # be sure to specify X_test, because the encoder needs the whole dataset to work.
                clean_and_encod_data=True,
                cat_encoder_names=None, # None is treated as False: no categorical encoders are applied
                clean_nan=True, # fill NaN values
                #cat_features=categorical_features, # DataBunch can detect categorical features itself.
                num_generator_features=True, # generate interaction features between numerical columns
                group_generator_features=False, # toggles generation of group-encoder features
                frequency_enc_num_features=False, # toggles frequency encoding of numerical features
                normalization=False, # toggles feature normalization
                random_state=RANDOM_SEED)
# Preview the generated interaction columns.
data.X_train.head(5)

Source X_train shape:(800, 20)| X_test shape:(200, 20)
##################################################
Auto detect cat features:13
> Start preprocessing Data
> Clean Nans in num features
> Generate interaction Num Features
 +24 Interaction Features
##################################################
> Total Features:28
##################################################
New X_train shape:(800, 28)| X_test shape:(200, 28)


Unnamed: 0,duration,credit_amount,age,num_dependents,credit_amount_/_duration,credit_amount_*_duration,credit_amount_-_duration,credit_amount_+_duration,credit_amount_/_num_dependents,credit_amount_*_num_dependents,...,duration_-_num_dependents,duration_+_num_dependents,duration_/_age,duration_*_age,duration_-_age,duration_+_age,num_dependents_/_age,num_dependents_*_age,num_dependents_-_age,num_dependents_+_age
0,60.0,6836.0,63.0,0,113.933333,410160.0,6776.0,6896.0,0.0,0.0,...,60.0,60.0,0.952381,3780.0,-3.0,123.0,0.0,0.0,-63.0,63.0
1,21.0,2319.0,33.0,0,110.428571,48699.0,2298.0,2340.0,0.0,0.0,...,21.0,21.0,0.636364,693.0,-12.0,54.0,0.0,0.0,-33.0,33.0
2,6.0,1236.0,50.0,0,206.0,7416.0,1230.0,1242.0,0.0,0.0,...,6.0,6.0,0.12,300.0,-44.0,56.0,0.0,0.0,-50.0,50.0
3,21.0,5003.0,29.0,0,238.238095,105063.0,4982.0,5024.0,0.0,0.0,...,21.0,21.0,0.724138,609.0,-8.0,50.0,0.0,0.0,-29.0,29.0
4,12.0,886.0,21.0,0,73.833333,10632.0,874.0,898.0,0.0,0.0,...,12.0,12.0,0.571429,252.0,-9.0,33.0,0.0,0.0,-21.0,21.0


### Frequency Encoder Numerical Features

In [7]:
# Only frequency encoding of the numerical features is switched on in this run.
data = DataBunch(X_train=X_train, 
                y_train=y_train,
                X_test=X_test, # be sure to specify X_test, because the encoder needs the whole dataset to work.
                clean_and_encod_data=True,
                cat_encoder_names=None, # None is treated as False: no categorical encoders are applied
                clean_nan=True, # fill NaN values
                #cat_features=categorical_features, # DataBunch can detect categorical features itself.
                num_generator_features=False, # toggles generation of interaction features between numerical columns
                group_generator_features=False, # toggles generation of group-encoder features
                frequency_enc_num_features=True, # add FrequencyEncoder_* columns for the numerical features
                normalization=False, # toggles feature normalization
                random_state=RANDOM_SEED)
# Preview the FrequencyEncoder_* columns next to the original numerical ones.
data.X_train.head(5)

Source X_train shape:(800, 20)| X_test shape:(200, 20)
##################################################
Auto detect cat features:13
> Start preprocessing Data
> Generate Frequency Encode num features
 +4 Frequency Encode Num Features
> Clean Nans in num features
##################################################
> Total Features:8
##################################################
New X_train shape:(800, 8)| X_test shape:(200, 8)


Unnamed: 0,duration,credit_amount,age,num_dependents,FrequencyEncoder_credit_amount,FrequencyEncoder_duration,FrequencyEncoder_num_dependents,FrequencyEncoder_age
0,60.0,6836.0,63.0,0,0.001,0.013,0.845,0.008
1,21.0,2319.0,33.0,0,0.001,0.03,0.845,0.033
2,6.0,1236.0,50.0,0,0.002,0.075,0.845,0.012
3,21.0,5003.0,29.0,0,0.001,0.03,0.845,0.037
4,12.0,886.0,21.0,0,0.001,0.179,0.845,0.014


## Normalization Data
Normalization uses `StandardScaler()`.

In [9]:
# Same settings as the frequency-encoding run, with normalization additionally enabled.
data = DataBunch(X_train=X_train, 
                y_train=y_train,
                X_test=X_test, # be sure to specify X_test, because the encoder needs the whole dataset to work.
                clean_and_encod_data=True,
                cat_encoder_names=None, # None is treated as False: no categorical encoders are applied
                clean_nan=True, # fill NaN values
                #cat_features=categorical_features, # DataBunch can detect categorical features itself.
                num_generator_features=False, # toggles generation of interaction features between numerical columns
                group_generator_features=False, # toggles generation of group-encoder features
                frequency_enc_num_features=True, # add FrequencyEncoder_* columns for the numerical features
                normalization=True, # normalize the resulting features
                random_state=RANDOM_SEED)
# Preview the normalized training frame.
data.X_train.head(5)

Source X_train shape:(800, 20)| X_test shape:(200, 20)
##################################################
Auto detect cat features:13
> Start preprocessing Data
> Generate Frequency Encode num features
 +4 Frequency Encode Num Features
> Clean Nans in num features
> Normalization Features
##################################################
> Total Features:8
##################################################
New X_train shape:(800, 8)| X_test shape:(200, 8)


Unnamed: 0,duration,credit_amount,age,num_dependents,FrequencyEncoder_credit_amount,FrequencyEncoder_duration,FrequencyEncoder_num_dependents,FrequencyEncoder_age
0,3.297082,1.199912,2.406187,-0.409736,-0.403815,-1.40662,0.409736,-1.603822
1,-0.008051,-0.35963,-0.224364,-0.409736,-0.403815,-1.14286,0.409736,0.166108
2,-1.279256,-0.733547,1.266282,-0.409736,2.062233,-0.44467,0.409736,-1.320634
3,-0.008051,0.56705,-0.575104,-0.409736,-0.403815,-1.14286,0.409736,0.449297
4,-0.770774,-0.854388,-1.276585,-0.409736,-0.403815,1.168925,0.409736,-1.179039


# Model DataBunch

In [10]:
# Afterwards, the prepared DataBunch can be passed to the model directly
model = LightGBMClassifier(databunch=data, random_state=RANDOM_SEED)
# Preview the training frame held by the model (note: `_data` is a private attribute).
model._data.X_train.head(5)

Unnamed: 0,duration,credit_amount,age,num_dependents,FrequencyEncoder_credit_amount,FrequencyEncoder_duration,FrequencyEncoder_num_dependents,FrequencyEncoder_age
0,3.297082,1.199912,2.406187,-0.409736,-0.403815,-1.40662,0.409736,-1.603822
1,-0.008051,-0.35963,-0.224364,-0.409736,-0.403815,-1.14286,0.409736,0.166108
2,-1.279256,-0.733547,1.266282,-0.409736,2.062233,-0.44467,0.409736,-1.320634
3,-0.008051,0.56705,-0.575104,-0.409736,-0.403815,-1.14286,0.409736,0.449297
4,-0.770774,-0.854388,-1.276585,-0.409736,-0.403815,1.168925,0.409736,-1.179039


In [13]:
# Alternatively, specify all DataBunch settings directly in the model constructor
model = LightGBMClassifier(
    X_train, 
    y_train, 
    X_test,
    cat_features=None, # let the categorical features be auto-detected
    clean_and_encod_data=True,
    cat_encoder_names=['OneHotEncoder', 'HelmertEncoder', 'HashingEncoder', 'FrequencyEncoder'],
    num_generator_features=True, # generate interaction features between numerical columns
    group_generator_features=False, # toggles generation of group-encoder features
    frequency_enc_num_features=True, # add FrequencyEncoder_* columns for the numerical features
    normalization=True, # normalize the resulting features
    clean_nan=True, # fill NaN values
    verbose=1,
    random_state=RANDOM_SEED,
    )
# Preview the preprocessed training frame held by the model.
model._data.X_train.head(5)

Source X_train shape:(800, 20)| X_test shape:(200, 20)
##################################################
Auto detect cat features:13
> Start preprocessing Data
> Generate cat encodet features
 +55 Features fromOneHotEncoder
 +44 Features fromHelmertEncoder
 +54 Features fromHashingEncoder
 +16 Features fromFrequencyEncoder
> Generate Frequency Encode num features
 +4 Frequency Encode Num Features
> Clean Nans in num features
> Generate interaction Num Features
 +24 Interaction Features
> Normalization Features
##################################################
> Total Features:201
##################################################
New X_train shape:(800, 201)| X_test shape:(200, 201)


Unnamed: 0,duration,credit_amount,age,num_dependents,OneHotEncoder_checking_status_1,OneHotEncoder_checking_status_2,OneHotEncoder_checking_status_3,OneHotEncoder_checking_status_4,OneHotEncoder_housing_1,OneHotEncoder_housing_2,...,duration_-_num_dependents,duration_+_num_dependents,duration_/_age,duration_*_age,duration_-_age,duration_+_age,num_dependents_/_age,num_dependents_*_age,num_dependents_-_age,num_dependents_+_age
0,3.297082,1.199912,2.406187,-0.409736,1.654786,-0.255434,-0.822891,-0.604308,0.629413,-0.460566,...,3.307063,3.284183,0.719309,6.110167,0.682788,4.140952,-0.398178,-0.39846,-2.427029,2.383275
1,-0.008051,-0.35963,-0.224364,-0.409736,-0.604308,3.914911,-0.822891,-0.604308,-1.588782,2.171241,...,0.004129,-0.020229,-0.037499,-0.102263,0.146742,-0.165632,-0.398178,-0.39846,0.21248,-0.235948
2,-1.279256,-0.733547,1.266282,-0.409736,-0.604308,-0.255434,1.215228,-0.604308,-1.588782,2.171241,...,-1.266231,-1.291156,-1.274102,-0.893155,-1.759198,-0.040803,-0.398178,-0.39846,-1.283241,1.248278
3,-0.008051,0.56705,-0.575104,-0.409736,-0.604308,-0.255434,1.215228,-0.604308,0.629413,-0.460566,...,0.004129,-0.020229,0.172706,-0.271308,0.384985,-0.415289,-0.398178,-0.39846,0.564415,-0.585178
4,-0.770774,-0.854388,-1.276585,-0.409736,-0.604308,-0.255434,1.215228,-0.604308,0.629413,-0.460566,...,-0.758087,-0.782785,-0.193007,-0.989752,0.325424,-1.476331,-0.398178,-0.39846,1.268284,-1.283638


If you need to make changes to the data, you can access the DataBunch directly through the model, e.g. `model._data.X_train`. However, I do not recommend doing this.

In [14]:
# Train and predict using the model's default parameters.
predicts = model.predict()
# Score element 0 of the returned test predictions against the held-out labels.
test_predictions = predicts['predict_test'][0]
test_auc = sklearn.metrics.roc_auc_score(y_test, test_predictions)
print('Test AUC: ', round(test_auc, 4))

0%|          | 0/1 [00:00<?, ?it/s]
 Mean Score roc_auc_score on 30 Folds: 0.7404 std: 0.062749
100%|██████████| 1/1 [00:11<00:00, 11.33s/it]
Test AUC:0.8297


**Encoder selection is an important part of all AutoML**    
Even with such smart preprocessing, don't forget the basic DS rule: **garbage in, garbage out.**

[RUS] Даже с такой умной предобработкой не забывайте: Мусор на входе - мусор на выходе.