In [1]:
# If you run this notebook on Google Colaboratory, uncomment the line below to install automl_alex.
# NOTE(review): inside Jupyter/IPython, `%pip install ...` is generally preferred over `!pip install ...`
# because it installs into the environment of the running kernel.
#!pip install automl-alex

In [2]:
import sklearn
from sklearn.model_selection import train_test_split

import automl_alex
from automl_alex import LightGBMClassifier, DataBunch

# Print the installed automl_alex version so the results below can be tied to a specific release.
print(automl_alex.__version__)

0.05.06.23


In [3]:
# One seed reused by the train/test split, the DataBunch and every model below,
# intended to keep reruns of the notebook repeatable.
RANDOM_SEED = 42

# Load Data

In [4]:
from sklearn.datasets import fetch_openml

In [5]:
# Fetch version 1 of the OpenML "adult" dataset as pandas objects (as_frame=True),
# then preview the first five feature rows (five is the default for head()).
dataset = fetch_openml(name='adult', version=1, as_frame=True)
dataset.data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country
0,2,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,1,0,2,United-States
1,3,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,0,United-States
2,2,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,2,United-States
3,3,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,2,United-States
4,1,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,2,Cuba


In [6]:
# Convert the target to binary labels by replacing each class with its integer category code.
# NOTE(review): which class ends up as 1 depends on the category order — confirm if it matters downstream.
target_categorical = dataset.target.astype('category')
dataset.target = target_categorical.cat.codes

In [7]:
# Hold out 25% of the rows as the test set; the fixed seed keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data,
    dataset.target,
    test_size=0.25,
    random_state=RANDOM_SEED,
)
# Display both shapes as a sanity check on the split sizes.
(X_train.shape, X_test.shape)

((36631, 14), (12211, 14))

# Data Cleaning and Encoding (DataBunch)

In [8]:
# Peek at the raw training features before any cleaning (head() shows five rows by default).
X_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country
27859,3,Private,304857.0,Masters,14.0,Separated,Tech-support,Not-in-family,White,Male,4,0,2,United-States
5654,0,Private,189590.0,Bachelors,13.0,Never-married,Tech-support,Not-in-family,White,Male,0,0,2,United-States
3779,4,Private,96299.0,HS-grad,9.0,Divorced,Transport-moving,Unmarried,White,Male,0,0,2,United-States
10522,4,Self-emp-not-inc,196307.0,Bachelors,13.0,Never-married,Prof-specialty,Not-in-family,White,Male,0,0,0,United-States
22461,0,Private,265434.0,Some-college,10.0,Never-married,Prof-specialty,Own-child,White,Female,0,0,1,United-States


In [9]:
# Summarise column dtypes and non-null counts to expose category columns and missing values.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36631 entries, 27859 to 15795
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   age             36631 non-null  category
 1   workclass       34534 non-null  category
 2   fnlwgt          36631 non-null  float64 
 3   education       36631 non-null  category
 4   education-num   36631 non-null  float64 
 5   marital-status  36631 non-null  category
 6   occupation      34525 non-null  category
 7   relationship    36631 non-null  category
 8   race            36631 non-null  category
 9   sex             36631 non-null  category
 10  capitalgain     36631 non-null  category
 11  capitalloss     36631 non-null  category
 12  hoursperweek    36631 non-null  category
 13  native-country  35993 non-null  category
dtypes: category(12), float64(2)
memory usage: 1.3 MB


As we can see, the data is quite dirty: there are object/category features and NaNs. But the **model trains successfully even on such a dirty dataset**:

[RUS] Как мы видим, данные довольно грязные: есть object/category признаки и NaN. Но модель успешно обучается даже на таком грязном датасете:

In [10]:
# Hand the raw, uncleaned train/test frames straight to the model; only the seed is set explicitly,
# every other option is left at its default.
model = LightGBMClassifier(X_train, y_train, X_test, random_state=RANDOM_SEED)

In [11]:
# Train with the default model parameters, then report ROC AUC of the test-set predictions.
predict_test, predict_train = model.fit_predict()
# NOTE(review): `sklearn.metrics` is reached through the bare `import sklearn`; this relies on the
# submodule having been loaded by another import — consider importing it explicitly.
test_auc = sklearn.metrics.roc_auc_score(y_test, predict_test)
print('Test AUC: ', round(test_auc, 4))

Mean Score roc_auc_score on 10 Folds: 0.9117 std: 0.006075
Test AUC:  0.9105


**How is this possible?**      
[RUS] как это возможно?

<img src="./img/magic.gif" width="400">

## DataBunch
Before entering the model, the data goes through a full cycle of pre-processing in DataBunch.  
[RUS] До того как попасть в модель, данные проходят полный цикл предобработки.

In [12]:
# Build the DataBunch by hand to inspect what the preprocessing produces.
# X_test must be supplied as well, because the encoder needs the whole dataset to work.
data = DataBunch(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    clean_and_encod_data=True,
    cat_encoder_name='OneHotEncoder',
    target_encoder_name=None,
    clean_nan=True,      # fill NaN values
    cat_features=None,   # None: DataBunch can auto-detect the categorical features
    random_state=RANDOM_SEED,
)

In [13]:
# Preview the training features after cleaning and one-hot encoding (first five rows).
data.X_train.head()

Unnamed: 0,fnlwgt,OneHotEncoder_capitalgain_1,OneHotEncoder_capitalgain_2,OneHotEncoder_capitalgain_3,OneHotEncoder_capitalgain_4,OneHotEncoder_capitalgain_5,OneHotEncoder_relationship_1,OneHotEncoder_relationship_2,OneHotEncoder_relationship_3,OneHotEncoder_relationship_4,...,OneHotEncoder_native-country_32,OneHotEncoder_native-country_33,OneHotEncoder_native-country_34,OneHotEncoder_native-country_35,OneHotEncoder_native-country_36,OneHotEncoder_native-country_37,OneHotEncoder_native-country_38,OneHotEncoder_native-country_39,OneHotEncoder_native-country_40,OneHotEncoder_native-country_41
0,304857.0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,189590.0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,96299.0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,196307.0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,265434.0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


## Encoders

In [14]:
# Available categorical encoders: a mapping from the name accepted by `cat_encoder_name`
# to the encoder class behind it.
automl_alex.encoders.encoders_names

{'HashingEncoder': category_encoders.hashing.HashingEncoder,
 'SumEncoder': category_encoders.sum_coding.SumEncoder,
 'OneHotEncoder': category_encoders.one_hot.OneHotEncoder,
 'HelmertEncoder': category_encoders.helmert.HelmertEncoder,
 'OrdinalEncoder': category_encoders.ordinal.OrdinalEncoder,
 'FrequencyEncoder': automl_alex.encoders.FrequencyEncoder}

In [15]:
# Available target encoders: a mapping from the name accepted by `target_encoder_name`
# to the encoder class behind it.
automl_alex.encoders.target_encoders_names

{'TargetEncoder': category_encoders.target_encoder.TargetEncoder,
 'CatBoostEncoder': category_encoders.cat_boost.CatBoostEncoder,
 'JamesSteinEncoder': category_encoders.james_stein.JamesSteinEncoder}

In [16]:
# Same data with different encoders: Helmert coding for the categorical features
# plus a James-Stein target encoder.
# cat_features is not passed here; DataBunch can detect categorical features itself.
data = DataBunch(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,  # be sure to specify X_test: the encoder needs the whole dataset to work
    clean_and_encod_data=True,
    cat_encoder_name='HelmertEncoder',
    target_encoder_name='JamesSteinEncoder',
    clean_nan=True,  # fill NaN values
    random_state=RANDOM_SEED,
)
data.X_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,...,HelmertEncoder_native-country_31,HelmertEncoder_native-country_32,HelmertEncoder_native-country_33,HelmertEncoder_native-country_34,HelmertEncoder_native-country_35,HelmertEncoder_native-country_36,HelmertEncoder_native-country_37,HelmertEncoder_native-country_38,HelmertEncoder_native-country_39,HelmertEncoder_native-country_40
0,3,Private,304857.0,Masters,14.0,Separated,Tech-support,Not-in-family,White,1,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,0,Private,189590.0,Bachelors,13.0,Never-married,Tech-support,Not-in-family,White,1,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,4,Private,96299.0,HS-grad,9.0,Divorced,Transport-moving,Unmarried,White,1,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,4,Self-emp-not-inc,196307.0,Bachelors,13.0,Never-married,Prof-specialty,Not-in-family,White,1,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,0,Private,265434.0,Some-college,10.0,Never-married,Prof-specialty,Own-child,White,0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


**Please note:** *Target Encoder processing takes place **inside cross-validation**. That is why the features still appear unprocessed here, even after Target Encoders have been added.*

[RUS] *Обратите внимание, что обработка Target Encoder происходит внутри кросс-валидации. Поэтому видны необработанные признаки после добавления Target Encoders.*

## Model databunch

In [17]:
# Once the DataBunch is built, it can be passed to the model through `databunch=`.
model = LightGBMClassifier(databunch=data, random_state=RANDOM_SEED)
# Show the first five rows of the training features held by the model.
model._data.X_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,...,HelmertEncoder_native-country_31,HelmertEncoder_native-country_32,HelmertEncoder_native-country_33,HelmertEncoder_native-country_34,HelmertEncoder_native-country_35,HelmertEncoder_native-country_36,HelmertEncoder_native-country_37,HelmertEncoder_native-country_38,HelmertEncoder_native-country_39,HelmertEncoder_native-country_40
0,3,Private,304857.0,Masters,14.0,Separated,Tech-support,Not-in-family,White,1,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,0,Private,189590.0,Bachelors,13.0,Never-married,Tech-support,Not-in-family,White,1,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,4,Private,96299.0,HS-grad,9.0,Divorced,Transport-moving,Unmarried,White,1,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,4,Self-emp-not-inc,196307.0,Bachelors,13.0,Never-married,Prof-specialty,Not-in-family,White,1,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,0,Private,265434.0,Some-college,10.0,Never-married,Prof-specialty,Own-child,White,0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [18]:
# Alternatively, give the model the raw data together with all the DataBunch settings.
model = LightGBMClassifier(
    X_train,
    y_train,
    X_test,
    cat_features=None,
    clean_and_encod_data=True,
    cat_encoder_name='HelmertEncoder',
    target_encoder_name='JamesSteinEncoder',
    clean_nan=True,
    random_state=RANDOM_SEED,
)
# Show the first five rows of the training features held by the model.
model._data.X_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,...,HelmertEncoder_native-country_31,HelmertEncoder_native-country_32,HelmertEncoder_native-country_33,HelmertEncoder_native-country_34,HelmertEncoder_native-country_35,HelmertEncoder_native-country_36,HelmertEncoder_native-country_37,HelmertEncoder_native-country_38,HelmertEncoder_native-country_39,HelmertEncoder_native-country_40
0,3,Private,304857.0,Masters,14.0,Separated,Tech-support,Not-in-family,White,1,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,0,Private,189590.0,Bachelors,13.0,Never-married,Tech-support,Not-in-family,White,1,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,4,Private,96299.0,HS-grad,9.0,Divorced,Transport-moving,Unmarried,White,1,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,4,Self-emp-not-inc,196307.0,Bachelors,13.0,Never-married,Prof-specialty,Not-in-family,White,1,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,0,Private,265434.0,Some-college,10.0,Never-married,Prof-specialty,Own-child,White,0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


If you need to make changes to the data, you can access the DataBunch directly through the model via `model._data.X_train`. However, I do not recommend doing this.

In [19]:
# Train with the default model parameters, then report ROC AUC of the test-set predictions.
predict_test, predict_train = model.fit_predict()
# NOTE(review): `sklearn.metrics` is reached through the bare `import sklearn`; this relies on the
# submodule having been loaded by another import — consider importing it explicitly.
test_auc = sklearn.metrics.roc_auc_score(y_test, predict_test)
print('Test AUC: ', round(test_auc, 4))

Mean Score roc_auc_score on 10 Folds: 0.9117 std: 0.006178
Test AUC:  0.9108


**Encoder selection is an important part of all AutoML**    
Even with such smart preprocessing, don't forget the basic DS rule: **Garbage in, garbage out.**

[RUS] Даже с такой умной предобработкой не забывайте: Мусор на входе - мусор на выходе.