In [1]:
# If you run this notebook on Google Colaboratory, uncomment the line below to install automl_alex.
# %pip install automl-alex

In [1]:
import sklearn
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

import automl_alex
from automl_alex import LightGBMClassifier, DataBunch

# Print the installed automl_alex version so the outputs below can be tied to a release.
print(automl_alex.__version__)

0.07.02


In [2]:
# Single seed shared by the train/test split, DataBunch and the models below.
RANDOM_SEED = 42

# Load Data

In [3]:
from sklearn.datasets import fetch_openml

In [4]:
# Fetch the OpenML "adult" dataset (version 1) as pandas objects.
dataset = fetch_openml(
    name='adult',
    version=1,
    as_frame=True,
)

# Replace the categorical class labels with their integer category codes.
dataset.target = dataset.target.astype('category').cat.codes

# Preview the first five feature rows.
dataset.data.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country
0,2,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,1,0,2,United-States
1,3,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,0,United-States
2,2,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,2,United-States
3,3,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,2,United-States
4,1,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,2,Cuba


In [5]:
# Hold out 25% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data,
    dataset.target,
    test_size=0.25,
    random_state=RANDOM_SEED,
)

# Shapes of the two feature frames, shown as the cell output.
(X_train.shape, X_test.shape)

((36631, 14), (12211, 14))

# Data Cleaning and Encoding (DataBunch)

In [6]:
# Raw training features, before any cleaning or encoding.
X_train.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country
27859,3,Private,304857.0,Masters,14.0,Separated,Tech-support,Not-in-family,White,Male,4,0,2,United-States
5654,0,Private,189590.0,Bachelors,13.0,Never-married,Tech-support,Not-in-family,White,Male,0,0,2,United-States
3779,4,Private,96299.0,HS-grad,9.0,Divorced,Transport-moving,Unmarried,White,Male,0,0,2,United-States
10522,4,Self-emp-not-inc,196307.0,Bachelors,13.0,Never-married,Prof-specialty,Not-in-family,White,Male,0,0,0,United-States
22461,0,Private,265434.0,Some-college,10.0,Never-married,Prof-specialty,Own-child,White,Female,0,0,1,United-States


In [7]:
# Per-column dtypes and non-null counts (the summary is printed, not returned).
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36631 entries, 27859 to 15795
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   age             36631 non-null  category
 1   workclass       34534 non-null  category
 2   fnlwgt          36631 non-null  float64 
 3   education       36631 non-null  category
 4   education-num   36631 non-null  float64 
 5   marital-status  36631 non-null  category
 6   occupation      34525 non-null  category
 7   relationship    36631 non-null  category
 8   race            36631 non-null  category
 9   sex             36631 non-null  category
 10  capitalgain     36631 non-null  category
 11  capitalloss     36631 non-null  category
 12  hoursperweek    36631 non-null  category
 13  native-country  35993 non-null  category
dtypes: category(12), float64(2)
memory usage: 1.3 MB


As we can see, the data is quite dirty: there are object/category features and NaNs. Nevertheless, the **model trains successfully even on such a dirty dataset**:

[RUS] Как мы видим, данные довольно грязные: есть object/category признаки и NaN. Но модель успешно обучается даже на таком грязном датасете:

In [8]:
# Build the classifier directly from the raw, uncleaned frames.
model = LightGBMClassifier(
    X_train,
    y_train,
    X_test,
    random_state=RANDOM_SEED,
)

In [9]:
# Run the model with its default parameters and collect its predictions.
# NOTE(review): despite the name, predict() appears to train as well (the
# logged output reports a cross-validated score) — confirm against automl_alex.
predicts = model.predict()

# Score the first entry of predicts['predict_test'] against the held-out labels.
# roc_auc_score is imported by name in the imports cell rather than reached
# through `sklearn.metrics`, which a bare `import sklearn` does not itself load.
test_auc = roc_auc_score(y_test, predicts['predict_test'][0])
print('Test AUC: ', round(test_auc, 4))

0%|          | 0/1 [00:00<?, ?it/s]
 Mean Score roc_auc_score on 30 Folds: 0.9131 std: 0.004999
100%|██████████| 1/1 [02:48<00:00, 168.25s/it]
Test AUC:0.9122


**How is this possible?**      
[RUS] как это возможно?

<img src="./img/magic.gif" width="400">

## DataBunch
Before entering the model, the data goes through a full preprocessing cycle in DataBunch.     
[RUS] До того как попасть в модель, данные проходят полный цикл предобработки в DataBunch.

In [8]:
# Run the DataBunch preprocessing on its own, one-hot encoding the categorical features.
data = DataBunch(
    X_train=X_train,
    y_train=y_train,
    # X_test must be supplied: the encoder needs the whole dataset to work.
    X_test=X_test,
    clean_and_encod_data=True,
    cat_encoder_name='OneHotEncoder',
    # Fill missing values.
    clean_nan=True,
    # None lets DataBunch auto-detect the categorical features.
    cat_features=None,
    random_state=RANDOM_SEED,
)

In [9]:
# Training features after DataBunch cleaning and one-hot encoding.
data.X_train.head(5)

Unnamed: 0,fnlwgt,OneHotEncoder_age_1,OneHotEncoder_age_2,OneHotEncoder_age_3,OneHotEncoder_age_4,OneHotEncoder_age_5,OneHotEncoder_capitalloss_1,OneHotEncoder_capitalloss_2,OneHotEncoder_capitalloss_3,OneHotEncoder_capitalloss_4,...,OneHotEncoder_capitalgain_2,OneHotEncoder_capitalgain_3,OneHotEncoder_capitalgain_4,OneHotEncoder_capitalgain_5,OneHotEncoder_relationship_1,OneHotEncoder_relationship_2,OneHotEncoder_relationship_3,OneHotEncoder_relationship_4,OneHotEncoder_relationship_5,OneHotEncoder_relationship_6
0,304857.0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,189590.0,0,1,0,0,0,1,0,0,0,...,1,0,0,0,1,0,0,0,0,0
2,96299.0,0,0,1,0,0,1,0,0,0,...,1,0,0,0,0,1,0,0,0,0
3,196307.0,0,0,1,0,0,1,0,0,0,...,1,0,0,0,1,0,0,0,0,0
4,265434.0,0,1,0,0,0,1,0,0,0,...,1,0,0,0,0,0,1,0,0,0


## Encoders

In [10]:
# Available encoders: a mapping from encoder name to encoder class.
automl_alex.encoders.encoders_names

{'HashingEncoder': category_encoders.hashing.HashingEncoder,
 'SumEncoder': category_encoders.sum_coding.SumEncoder,
 'OneHotEncoder': category_encoders.one_hot.OneHotEncoder,
 'HelmertEncoder': category_encoders.helmert.HelmertEncoder,
 'OrdinalEncoder': category_encoders.ordinal.OrdinalEncoder}

In [12]:
# DataBunch preprocessing with Helmert encoding of the categorical features.
data = DataBunch(
    X_train=X_train,
    y_train=y_train,
    # X_test must be supplied: the encoder needs the whole dataset to work.
    X_test=X_test,
    clean_and_encod_data=True,
    cat_encoder_name='HelmertEncoder',
    # Fill missing values.
    clean_nan=True,
    # cat_features is left out: DataBunch can detect categorical features itself.
    random_state=RANDOM_SEED,
)

# Preview the encoded training features.
data.X_train.head(5)

Unnamed: 0,fnlwgt,HelmertEncoder_age_0,HelmertEncoder_age_1,HelmertEncoder_age_2,HelmertEncoder_age_3,HelmertEncoder_capitalloss_0,HelmertEncoder_capitalloss_1,HelmertEncoder_capitalloss_2,HelmertEncoder_capitalloss_3,HelmertEncoder_education-num,...,HelmertEncoder_education_14,HelmertEncoder_capitalgain_0,HelmertEncoder_capitalgain_1,HelmertEncoder_capitalgain_2,HelmertEncoder_capitalgain_3,HelmertEncoder_relationship_0,HelmertEncoder_relationship_1,HelmertEncoder_relationship_2,HelmertEncoder_relationship_3,HelmertEncoder_relationship_4
0,304857.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,14.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,189590.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,13.0,...,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,96299.0,0.0,2.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,9.0,...,-1.0,1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,-1.0
3,196307.0,0.0,2.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,13.0,...,-1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,265434.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,10.0,...,-1.0,1.0,-1.0,-1.0,-1.0,0.0,2.0,-1.0,-1.0,-1.0


**Please note:** *Target Encoder processing takes place **inside cross-validation**. That is why the unprocessed (raw) features are still visible after the Target Encoders have been added.*

[RUS] *Обратите внимание, что обработка Target Encoder происходит внутри кросс-валидации. Поэтому после добавления Target Encoders видны необработанные признаки.*

## Model databunch

In [15]:
# A prepared DataBunch can be handed to the model instead of raw frames.
model = LightGBMClassifier(
    databunch=data,
    random_state=RANDOM_SEED,
)

# Inspect the training frame held by the model (accessed through a private attribute).
model._data.X_train.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,...,HelmertEncoder_marital-status_0,HelmertEncoder_marital-status_1,HelmertEncoder_marital-status_2,HelmertEncoder_marital-status_3,HelmertEncoder_marital-status_4,HelmertEncoder_marital-status_5,HelmertEncoder_hoursperweek_0,HelmertEncoder_hoursperweek_1,HelmertEncoder_hoursperweek_2,HelmertEncoder_hoursperweek_3
0,3,Private,304857.0,Masters,14.0,Separated,Tech-support,Not-in-family,White,1,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,0,Private,189590.0,Bachelors,13.0,Never-married,Tech-support,Not-in-family,White,1,...,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,4,Private,96299.0,HS-grad,9.0,Divorced,Transport-moving,Unmarried,White,1,...,0.0,2.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,4,Self-emp-not-inc,196307.0,Bachelors,13.0,Never-married,Prof-specialty,Not-in-family,White,1,...,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0
4,0,Private,265434.0,Some-college,10.0,Never-married,Prof-specialty,Own-child,White,0,...,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,2.0,-1.0,-1.0


In [16]:
# Alternatively, pass the raw frames together with every DataBunch setting.
model = LightGBMClassifier(
    X_train,
    y_train,
    X_test,
    cat_features=None,
    clean_and_encod_data=True,
    cat_encoder_name='HelmertEncoder',
    target_encoder_name='JamesSteinEncoder',
    clean_nan=True,
    random_state=RANDOM_SEED,
)

# Inspect the training frame held by the model (accessed through a private attribute).
model._data.X_train.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,...,HelmertEncoder_marital-status_0,HelmertEncoder_marital-status_1,HelmertEncoder_marital-status_2,HelmertEncoder_marital-status_3,HelmertEncoder_marital-status_4,HelmertEncoder_marital-status_5,HelmertEncoder_hoursperweek_0,HelmertEncoder_hoursperweek_1,HelmertEncoder_hoursperweek_2,HelmertEncoder_hoursperweek_3
0,3,Private,304857.0,Masters,14.0,Separated,Tech-support,Not-in-family,White,1,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,0,Private,189590.0,Bachelors,13.0,Never-married,Tech-support,Not-in-family,White,1,...,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,4,Private,96299.0,HS-grad,9.0,Divorced,Transport-moving,Unmarried,White,1,...,0.0,2.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,4,Self-emp-not-inc,196307.0,Bachelors,13.0,Never-married,Prof-specialty,Not-in-family,White,1,...,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0
4,0,Private,265434.0,Some-college,10.0,Never-married,Prof-specialty,Own-child,White,0,...,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,2.0,-1.0,-1.0


If you need to change the data, you can access the DataBunch directly through the model, e.g. `model._data.X_train`. However, this is not recommended.

In [22]:
# Run the model with its default parameters and collect its predictions.
# NOTE(review): despite the name, predict() appears to train as well (the
# logged output reports a cross-validated score) — confirm against automl_alex.
predicts = model.predict()

# Score the first entry of predicts['predict_test'] against the held-out labels.
# roc_auc_score is imported by name in the imports cell rather than reached
# through `sklearn.metrics`, which a bare `import sklearn` does not itself load.
test_auc = roc_auc_score(y_test, predicts['predict_test'][0])
print('Test AUC: ', round(test_auc, 4))

0%|          | 0/1 [00:00<?, ?it/s]
 Mean Score roc_auc_score on 30 Folds: 0.9131 std: 0.005
100%|██████████| 1/1 [00:49<00:00, 49.84s/it]
Test AUC:0.9122


## Opt Encoders

In [23]:
# Run the optimisation with encoder selection enabled (opt_encoders=True).
# NOTE(review): timeout=300 is presumably in seconds — confirm; the logged run took 05:49.
history = model.opt(timeout=300, opt_encoders=True)

One iteration takes ~ 7.8 sec
Start Auto calibration parameters
Start optimization with the parameters:
Score_folds =1
Opt_lvl =1
Cold_start =63.0
Early_stoping =100
Metric =roc_auc_score
Direction =maximize
Default model roc_auc_score = 0.9102
########################################
Optimize: : 31it [05:49, 11.29s/it, LightGBM Best Score roc_auc_score = 0.9146 ]


In [24]:
# First ten rows of the optimisation history returned by model.opt().
history.head(10)

Unnamed: 0,score_opt,model_score,score_std,model_name,model_param,wrapper_params,cat_encoder,target_encoder
2,0.9146,0.9146,0,LightGBM,"{'random_seed': 42, 'early_stopping_rounds': 5...","{'need_norm_data': False, 'early_stopping': Fa...",OneHotEncoder,JamesSteinEncoder
23,0.9146,0.9146,0,LightGBM,"{'random_seed': 42, 'early_stopping_rounds': 5...","{'need_norm_data': False, 'early_stopping': Fa...",HashingEncoder,JamesSteinEncoder
0,0.9144,0.9144,0,LightGBM,"{'random_seed': 42, 'early_stopping_rounds': 5...","{'need_norm_data': False, 'early_stopping': Fa...",HelmertEncoder,TargetEncoder
29,0.9144,0.9144,0,LightGBM,"{'random_seed': 42, 'early_stopping_rounds': 5...","{'need_norm_data': False, 'early_stopping': Fa...",OrdinalEncoder,JamesSteinEncoder
22,0.9143,0.9143,0,LightGBM,"{'random_seed': 42, 'early_stopping_rounds': 5...","{'need_norm_data': False, 'early_stopping': Fa...",HashingEncoder,JamesSteinEncoder
12,0.9139,0.9139,0,LightGBM,"{'random_seed': 42, 'early_stopping_rounds': 5...","{'need_norm_data': False, 'early_stopping': Fa...",SumEncoder,JamesSteinEncoder
10,0.9139,0.9139,0,LightGBM,"{'random_seed': 42, 'early_stopping_rounds': 5...","{'need_norm_data': False, 'early_stopping': Fa...",HelmertEncoder,TargetEncoder
11,0.9134,0.9134,0,LightGBM,"{'random_seed': 42, 'early_stopping_rounds': 5...","{'need_norm_data': False, 'early_stopping': Fa...",SumEncoder,TargetEncoder
3,0.9132,0.9132,0,LightGBM,"{'random_seed': 42, 'early_stopping_rounds': 5...","{'need_norm_data': False, 'early_stopping': Fa...",OrdinalEncoder,JamesSteinEncoder
13,0.9132,0.9132,0,LightGBM,"{'random_seed': 42, 'early_stopping_rounds': 5...","{'need_norm_data': False, 'early_stopping': Fa...",OneHotEncoder,CatBoostEncoder


**Encoder selection is an important part of all AutoML**    
Even with such smart preprocessing, don't forget the basic data-science rule: **garbage in, garbage out.**

[RUS] Даже с такой умной предобработкой не забывайте: Мусор на входе - мусор на выходе.