In [1]:
# If you run this notebook on Google Colaboratory, uncomment the line below to install automl-alex.
# (%pip, unlike !pip, installs into the environment of the running kernel.)
# %pip install -U -q automl-alex

In [1]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml

import automl_alex
from automl_alex import DataPrepare

# Show which automl_alex version produced the outputs in this notebook.
print(automl_alex.__version__)

1.02.15


In [None]:
# Seed passed as random_state to train_test_split and DataPrepare further down.
RANDOM_SEED = 42

# Load Data

In [3]:
# Fetch version 1 of the "adult" dataset from OpenML as pandas objects.
dataset = fetch_openml(
    name='adult',
    version=1,
    as_frame=True,
)
# Encode the class labels as integer category codes (binary target).
dataset.target = dataset.target.astype('category').cat.codes
# Preview the first five feature rows.
dataset.data.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country
0,2,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,1,0,2,United-States
1,3,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,0,United-States
2,2,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,2,United-States
3,3,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,2,United-States
4,1,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,2,Cuba


In [4]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data,
    dataset.target,
    test_size=0.2,
    random_state=RANDOM_SEED,
)
# Display the (rows, columns) shape of each feature split.
X_train.shape, X_test.shape

((39073, 14), (9769, 14))

In [5]:
# Inspect column dtypes and non-null counts to see which columns have missing values.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39073 entries, 37193 to 15795
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   age             39073 non-null  category
 1   workclass       36851 non-null  category
 2   fnlwgt          39073 non-null  float64 
 3   education       39073 non-null  category
 4   education-num   39073 non-null  float64 
 5   marital-status  39073 non-null  category
 6   occupation      36842 non-null  category
 7   relationship    39073 non-null  category
 8   race            39073 non-null  category
 9   sex             39073 non-null  category
 10  capitalgain     39073 non-null  category
 11  capitalloss     39073 non-null  category
 12  hoursperweek    39073 non-null  category
 13  native-country  38396 non-null  category
dtypes: category(12), float64(2)
memory usage: 1.3 MB


As we can see, the data is quite dirty: there are object/category features and NaNs. But the **model trains successfully even on such a dirty dataset**.  
[RUS] Как мы видим, данные довольно грязные: есть object/category признаки и NaN. Но модель успешно обучается даже на таком грязном датасете.

# Data Cleaning (DataPrepare)
Before entering AutoML, the data goes through a full cycle of pre-processing in the DataPrepare class. Let's look at it separately.  
[RUS] До того как попасть в AutoML, данные проходят полный цикл предобработки в классе DataPrepare. Давайте рассмотрим его отдельно.

In [6]:
# Spell out the preprocessing options explicitly, then fit on the training split.
de = DataPrepare(
    clean_and_encod_data=True,
    # Encoders used to generate encoded categorical features.
    cat_encoder_names=['HelmertEncoder', 'CountEncoder'],
    # Fill missing values.
    clean_nan=True,
    # Drop invariant features (data.nunique < 2).
    drop_invariant=True,
    # Generate interaction features between numerical columns.
    num_generator_features=True,
    # Normalize the data (StandardScaler).
    normalization=True,
    # None: DataPrepare can auto-detect the categorical features.
    cat_features=None,
    random_state=RANDOM_SEED,
)
clean_X_train = de.fit_transform(X_train)

Source data shape:  (39073, 14)
##################################################
! START preprocessing Data
- Auto detect cat features:  12
> Binary Features
> Clean Categorical Features
> Transform Categorical Features.
 - Encoder: HelmertEncoder ADD features: 123
 - Encoder: CountEncoder ADD features: 12
  No nans features
> Generate interaction Num Features
 ADD features: 4
> Normalization Features
##################################################
Final data shape:  (39073, 153)
Total ADD columns: 139


In [7]:
# Preview the first five rows of the transformed training features.
clean_X_train.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,...,CountEncoder_relationship,CountEncoder_race,CountEncoder_capitalgain,CountEncoder_capitalloss,CountEncoder_hoursperweek,CountEncoder_native-country,education-num_/_fnlwgt,education-num_*_fnlwgt,education-num_-_fnlwgt,education-num_+_fnlwgt
0,-1.213983,-0.578503,-1.315975,-0.994139,-0.420108,-0.938503,-1.571706,-0.973272,-0.343966,0.704448,...,1.066668,0.410588,0.302964,0.22103,0.859071,0.33534,1.279547,-1.224664,1.315963,-1.315986
1,-0.540769,0.008976,-0.431457,-0.737964,1.529031,-0.938503,-1.293613,-0.973272,-0.343966,0.704448,...,1.066668,0.410588,0.302964,0.22103,0.859071,0.33534,0.22328,0.098973,0.431494,-0.431421
2,-1.213983,0.596455,0.587895,-0.48179,-0.809935,-0.938503,-1.01552,-0.973272,-0.343966,0.704448,...,1.066668,0.410588,0.302964,0.22103,0.859071,0.33534,-0.634778,0.096284,-0.587914,0.587876
3,0.132444,-0.578503,-1.138576,-0.994139,-0.420108,-0.259678,-0.737427,-0.356984,-0.343966,0.704448,...,-1.286459,0.410588,0.302964,0.22103,-1.335637,0.33534,0.65008,-1.081693,1.138565,-1.138587
4,0.132444,1.183934,-1.525211,-0.994139,-0.420108,0.419147,-0.459334,0.259304,-0.343966,0.704448,...,-0.076981,0.410588,0.302964,0.22103,-1.59887,0.33534,3.083612,-1.393293,1.525199,-1.525223


<img src="./img/magic.gif" width="400">

In [8]:
# Reuse the DataPrepare object fitted on the training split: only transform() is called on the test split.
clean_X_test = de.transform(X_test)

##################################################
! Start Transform Data
> Clean Binary Features
> Clean Categorical Features
> Transform Categorical Features.
 - Encoder: HelmertEncoder ADD features: 123
 - Encoder: CountEncoder ADD features: 12
> Generate interaction Num Features
 ADD features: 4
> Normalization Features
##################################################
Final data shape:  (9769, 153)
Total ADD columns: 139


In [9]:
# Preview the first five rows of the transformed test features.
clean_X_test.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,...,CountEncoder_relationship,CountEncoder_race,CountEncoder_capitalgain,CountEncoder_capitalloss,CountEncoder_hoursperweek,CountEncoder_native-country,education-num_/_fnlwgt,education-num_*_fnlwgt,education-num_-_fnlwgt,education-num_+_fnlwgt
0,1.478871,-0.578503,2.202051,-0.994139,-0.420108,1.097972,-0.181241,0.259304,-0.343966,0.704448,...,-0.076981,0.410588,0.302964,0.22103,-1.335637,0.33534,-0.772076,1.610616,-2.202059,2.202043
1,1.478871,-0.578503,-0.104463,-0.48179,-0.809935,1.097972,-0.459334,1.49188,-0.343966,-1.419551,...,-0.871154,0.410588,0.302964,0.22103,-1.335637,0.33534,-0.464056,-0.399708,0.104443,-0.104482
2,1.478871,0.596455,1.502379,-0.994139,-0.420108,1.097972,0.931131,0.875592,1.069113,0.704448,...,-1.866034,-2.340468,0.302964,0.22103,0.859071,0.33534,-0.712736,1.04673,-1.502387,1.50237
3,1.478871,-0.578503,0.26657,0.286733,-0.03028,1.097972,-0.459334,1.49188,-0.343966,-1.419551,...,-0.871154,0.410588,0.302964,0.22103,-1.204548,0.33534,-0.449307,0.235418,-0.266571,0.26657
4,0.805658,-0.578503,0.510479,-0.994139,-0.420108,1.097972,1.209224,-0.356984,5.308352,0.704448,...,-1.286459,-2.651793,0.302964,0.22103,-0.914557,-2.997947,-0.566904,0.247329,-0.510489,0.510469


## Save and Load
A separate transform step allows us to save the fitted preprocessing and apply it to new data.  
[RUS] Раздельный transform позволяет нам сохранять и переносить обработку на новые данные.

In [10]:
# Persist the fitted DataPrepare object under the name 'de'.
de.save('de')

Save DataPrepare


In [11]:
# Restore the DataPrepare object previously saved under the name 'de';
# a fresh instance is created only to call load() on it.
de = DataPrepare().load('de')

In [12]:
# Transform the test split with the reloaded object; the rows should match the earlier clean_X_test preview.
de.transform(X_test).head(5)

##################################################
! Start Transform Data
> Clean Binary Features
> Clean Categorical Features
> Transform Categorical Features.
 - Encoder: HelmertEncoder ADD features: 123
 - Encoder: CountEncoder ADD features: 12
> Generate interaction Num Features
 ADD features: 4
> Normalization Features
##################################################
Final data shape:  (9769, 153)
Total ADD columns: 139


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,...,CountEncoder_relationship,CountEncoder_race,CountEncoder_capitalgain,CountEncoder_capitalloss,CountEncoder_hoursperweek,CountEncoder_native-country,education-num_/_fnlwgt,education-num_*_fnlwgt,education-num_-_fnlwgt,education-num_+_fnlwgt
0,1.478871,-0.578503,2.202051,-0.994139,-0.420108,1.097972,-0.181241,0.259304,-0.343966,0.704448,...,-0.076981,0.410588,0.302964,0.22103,-1.335637,0.33534,-0.772076,1.610616,-2.202059,2.202043
1,1.478871,-0.578503,-0.104463,-0.48179,-0.809935,1.097972,-0.459334,1.49188,-0.343966,-1.419551,...,-0.871154,0.410588,0.302964,0.22103,-1.335637,0.33534,-0.464056,-0.399708,0.104443,-0.104482
2,1.478871,0.596455,1.502379,-0.994139,-0.420108,1.097972,0.931131,0.875592,1.069113,0.704448,...,-1.866034,-2.340468,0.302964,0.22103,0.859071,0.33534,-0.712736,1.04673,-1.502387,1.50237
3,1.478871,-0.578503,0.26657,0.286733,-0.03028,1.097972,-0.459334,1.49188,-0.343966,-1.419551,...,-0.871154,0.410588,0.302964,0.22103,-1.204548,0.33534,-0.449307,0.235418,-0.266571,0.26657
4,0.805658,-0.578503,0.510479,-0.994139,-0.420108,1.097972,1.209224,-0.356984,5.308352,0.704448,...,-1.286459,-2.651793,0.302964,0.22103,-0.914557,-2.997947,-0.566904,0.247329,-0.510489,0.510469


# Feature Engineering

## Categorical Features 
### Encoders

In [13]:
# Available encoders: a mapping of encoder name -> encoder class.
# The names passed in cat_encoder_names elsewhere in this notebook appear as keys here.
automl_alex.encoders.cat_encoders_names

{'HashingEncoder': category_encoders.hashing.HashingEncoder,
 'SumEncoder': category_encoders.sum_coding.SumEncoder,
 'BackwardDifferenceEncoder': category_encoders.backward_difference.BackwardDifferenceEncoder,
 'OneHotEncoder': category_encoders.one_hot.OneHotEncoder,
 'HelmertEncoder': category_encoders.helmert.HelmertEncoder,
 'BaseNEncoder': category_encoders.basen.BaseNEncoder,
 'CountEncoder': category_encoders.count.CountEncoder}

You can use any of these encoders for categorical features and combine their results (if you have enough memory).

In [14]:
# Combine four encoders at once; no other DataPrepare option is passed here.
# Note: this rebinds `de` and `clean_X_train` from the earlier cells.
de = DataPrepare(
    # Encoders used to generate encoded categorical features.
    cat_encoder_names=[
        'HelmertEncoder',
        'OneHotEncoder',
        'CountEncoder',
        'HashingEncoder',
    ],
)
clean_X_train = de.fit_transform(X_train)

Source data shape:  (39073, 14)
##################################################
! START preprocessing Data
- Auto detect cat features:  12
> Binary Features
> Clean Categorical Features
> Transform Categorical Features.
 - Encoder: HelmertEncoder ADD features: 123
 - Encoder: OneHotEncoder ADD features: 135
 - Encoder: CountEncoder ADD features: 12
 - Encoder: HashingEncoder ADD features: 12
  No nans features
> Generate interaction Num Features
 ADD features: 4
> Normalization Features
##################################################
Final data shape:  (39073, 300)
Total ADD columns: 286


## Numerical Features

In [17]:
# Switch to a dataset with more numerical features.
# Note: this rebinds `dataset` and the train/test splits, replacing the "adult" data from here on.
dataset = fetch_openml(
    name='credit-g',
    version=1,
    as_frame=True,
)
# Encode the class labels as integer category codes.
dataset.target = dataset.target.astype('category').cat.codes

X_train, X_test, y_train, y_test = train_test_split(
    dataset.data,
    dataset.target,
    test_size=0.2,
    random_state=RANDOM_SEED,
)
# Preview the first three training rows.
X_train.head(3)

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
29,<0,60.0,delayed previously,business,6836.0,<100,>=7,3.0,male single,none,4.0,no known property,63.0,none,own,2.0,skilled,1.0,yes,yes
535,>=200,21.0,critical/other existing credit,education,2319.0,<100,<1,2.0,male div/sep,none,1.0,car,33.0,none,rent,1.0,skilled,1.0,none,yes
695,no checking,6.0,existing paid,used car,1236.0,500<=X<1000,1<=X<4,2.0,male single,none,4.0,life insurance,50.0,none,rent,1.0,skilled,1.0,none,yes


### Generator interaction Num Features
For pairs of numerical features A and B, the interaction generator adds four new features: `A/B`, `A*B`, `A-B` and `A+B`.

In [21]:
# Isolate the numerical interaction generator.
de = DataPrepare(
    # Empty list: no categorical encoders are requested.
    cat_encoder_names=[],
    # Generate interaction features between numerical columns.
    num_generator_features=True,
    # Normalization (StandardScaler) is switched off here.
    normalization=False,
)
clean_X_train = de.fit_transform(X_train)

Source data shape:  (800, 20)
##################################################
! START preprocessing Data
- Auto detect cat features:  13
> Binary Features
> Clean Categorical Features
> Transform Categorical Features.
  No nans features
> Generate interaction Num Features
 ADD features: 60
##################################################
Final data shape:  (800, 80)
Total ADD columns: 60


In [24]:
# Preview the first five rows, including the generated interaction columns.
clean_X_train.head(5)

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,duration_-_age,duration_+_age,duration_/_existing_credits,duration_*_existing_credits,duration_-_existing_credits,duration_+_existing_credits,age_/_existing_credits,age_*_existing_credits,age_-_existing_credits,age_+_existing_credits
0,1,60.0,1,1,6836.0,1,1,3.0,1,1,...,-3.0,123.0,30.0,120.0,58.0,62.0,31.5,126.0,61.0,65.0
1,2,21.0,2,2,2319.0,1,2,2.0,2,1,...,-12.0,54.0,21.0,21.0,20.0,22.0,33.0,33.0,32.0,34.0
2,3,6.0,3,3,1236.0,2,3,2.0,1,1,...,-44.0,56.0,6.0,6.0,5.0,7.0,50.0,50.0,49.0,51.0
3,3,21.0,4,4,5003.0,3,3,1.0,3,1,...,-8.0,50.0,10.5,42.0,19.0,23.0,14.5,58.0,27.0,31.0
4,3,12.0,3,5,886.0,3,3,4.0,3,1,...,-9.0,33.0,12.0,12.0,11.0,13.0,21.0,21.0,20.0,22.0


**Encoder selection is an important part of any AutoML pipeline.**    
Even with such smart preprocessing, don't forget the basic data-science rule: **garbage in, garbage out.**

[RUS] Даже с такой умной предобработкой не забывайте: Мусор на входе - мусор на выходе.

<img src="./img/data-cleaning.png" width="500">