In [1]:
# If you run this notebook on Google Colaboratory, uncomment the line below to install automl_alex.
#%pip install -U -q automl-alex

In [1]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml

import automl_alex
from automl_alex import DataPrepare

# Show the installed automl_alex version so the outputs below can be tied to a release.
print(automl_alex.__version__)

1.02.15


In [2]:
# Single seed reused for the train/test splits and for DataPrepare.
RANDOM_SEED = 42

# Load Data

In [3]:
# Download the 'adult' dataset (version 1) from OpenML as pandas objects.
dataset = fetch_openml(name='adult', version=1, as_frame=True)

# Replace the target labels with their integer category codes.
dataset['target'] = dataset['target'].astype('category').cat.codes

# Preview the first rows of the raw features.
dataset.data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country
0,2,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,1,0,2,United-States
1,3,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,0,United-States
2,2,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,2,United-States
3,3,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,2,United-States
4,1,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,2,Cuba


In [4]:
# Hold out 20% of the rows for testing; the shared seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data,
    dataset.target,
    test_size=0.2,
    random_state=RANDOM_SEED,
)
X_train.shape, X_test.shape

((39073, 14), (9769, 14))

In [5]:
# Inspect column dtypes and non-null counts of the training features.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39073 entries, 37193 to 15795
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   age             39073 non-null  category
 1   workclass       36851 non-null  category
 2   fnlwgt          39073 non-null  float64 
 3   education       39073 non-null  category
 4   education-num   39073 non-null  float64 
 5   marital-status  39073 non-null  category
 6   occupation      36842 non-null  category
 7   relationship    39073 non-null  category
 8   race            39073 non-null  category
 9   sex             39073 non-null  category
 10  capitalgain     39073 non-null  category
 11  capitalloss     39073 non-null  category
 12  hoursperweek    39073 non-null  category
 13  native-country  38396 non-null  category
dtypes: category(12), float64(2)
memory usage: 1.3 MB


As we can see, the data is quite dirty: there are object/category features and NaNs. But the **model is successfully trained even on such a dirty dataset**      
[RUS] Как мы видим, данные довольно грязные, есть object/category признаки и nans. Но модель успешно обучается даже на таком грязном датасете

# Data Cleaning (DataPrepare)
Before entering the AutoML, the data goes through a full cycle of pre-processing in the DataPrepare class. Let's look at it separately.     
[RUS] До того как попасть в AutoML, данные проходят полный цикл предобработки в классе DataPrepare. Давайте рассмотрим его отдельно

In [6]:
# Configure the preprocessing pipeline; every option is spelled out for illustration.
de = DataPrepare(
    clean_and_encod_data=True,
    # Optional: cat_encoder_names=['HelmertEncoder', 'CountEncoder'] selects the
    # encoders used to generate categorical-encoded features.
    clean_nan=True,               # fill missing values
    clean_outliers=True,          # method='IQR', threshold=1.5
    drop_invariant=True,          # drop invariant features (data.nunique < 2)
    num_generator_features=True,  # generate interaction features from numeric columns
    normalization=True,           # normalize data (StandardScaler)
    cat_features=None,            # None -> DataPrepare auto-detects categorical features
    random_state=RANDOM_SEED,
)

# Fit the pipeline on the training features and transform them in one step.
clean_X_train = de.fit_transform(X_train)

Source data shape:  (39073, 14)
##################################################
! START preprocessing Data
- Auto detect cat features:  12
> Binary Features
> Clean Categorical Features
> Transform Categorical Features.
 - Encoder: HelmertEncoder ADD features: 123
 - Encoder: HashingEncoder ADD features: 12
Num of outlier detected: 560 in Feature fnlwgt
Proportion of outlier detected: 1.4 %
Num of outlier detected: 253 in Feature education-num
Proportion of outlier detected: 0.6 %
  No nans features
> Generate interaction Num Features
 ADD features: 3
> Normalization Features
##################################################
Final data shape:  (39073, 154)
Total ADD columns: 140
Memory usage of dataframe is 45.91 MB
Memory usage after optimization is: 11.48 MB
Decreased by 75.0%


In [7]:
# Preview the first rows of the preprocessed training features.
clean_X_train.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,...,HashingEncoder_race,HashingEncoder_capitalgain,HashingEncoder_capitalloss,HashingEncoder_hoursperweek,HashingEncoder_native-country,fnlwgt_Is_Outliers_IQR,education-num_Is_Outliers_IQR,fnlwgt_/_education-num,fnlwgt_*_education-num,fnlwgt_-_education-num
0,-1.213867,-0.578613,-1.393555,-0.994141,-0.42749,-0.938477,-1.571289,-0.973145,-0.343994,0.70459,...,-0.343994,-0.270752,-0.19812,-0.771973,-0.224609,0.0,0.0,-0.993652,-1.287109,-1.393555
1,-0.540527,0.00898,-0.445557,-0.737793,1.541016,-0.938477,-1.293945,-0.973145,-0.343994,0.70459,...,-0.343994,-0.270752,-0.19812,-0.771973,-0.224609,0.0,0.0,-0.68457,0.118225,-0.445557
2,-1.213867,0.59668,0.647461,-0.481689,-0.820801,-0.938477,-1.015625,-0.973145,-0.343994,0.70459,...,-0.343994,-0.270752,-0.19812,-0.771973,-0.224609,0.0,0.0,0.719238,0.115356,0.647461
3,0.132446,-0.578613,-1.203125,-0.994141,-0.42749,-0.259766,-0.737305,-0.356934,-0.343994,0.70459,...,-0.343994,-0.270752,-0.19812,-0.124207,-0.224609,0.0,0.0,-0.855469,-1.134766,-1.203125
4,0.132446,1.183594,-1.618164,-0.994141,-0.42749,0.419189,-0.459229,0.259277,-0.343994,0.70459,...,-0.343994,-0.270752,-0.19812,0.523438,-0.224609,0.0,0.0,-1.15625,-1.46582,-1.618164


<img src="./img/magic.gif" width="400">

In [8]:
# Reuse the DataPrepare fitted on X_train: transform() is called here, not fit_transform().
clean_X_test = de.transform(X_test)

##################################################
! Start Transform Data
> Clean Binary Features
> Clean Categorical Features
> Transform Categorical Features.
 - Encoder: HelmertEncoder ADD features: 123
 - Encoder: HashingEncoder ADD features: 12
> Generate interaction Num Features
 ADD features: 3
> Normalization Features
##################################################
Final data shape:  (9769, 154)
Total ADD columns: 140
Memory usage of dataframe is 11.48 MB
Memory usage after optimization is: 2.87 MB
Decreased by 75.0%


In [9]:
# Preview the first rows of the preprocessed test features.
clean_X_test.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,...,HashingEncoder_race,HashingEncoder_capitalgain,HashingEncoder_capitalloss,HashingEncoder_hoursperweek,HashingEncoder_native-country,fnlwgt_Is_Outliers_IQR,education-num_Is_Outliers_IQR,fnlwgt_/_education-num,fnlwgt_*_education-num,fnlwgt_-_education-num
0,1.478516,-0.578613,2.378906,-0.994141,-0.42749,1.097656,-0.181274,0.259277,-0.343994,0.70459,...,-0.343994,-0.270752,-0.19812,-0.124207,-0.224609,0.0,0.0,1.743164,1.722656,2.378906
1,1.478516,-0.578613,-0.094849,-0.481689,-0.820801,1.097656,-0.459229,1.492188,-0.343994,-1.419922,...,-0.343994,-0.270752,-0.19812,-0.124207,-0.224609,0.0,0.0,0.113342,-0.411133,-0.094849
2,1.478516,0.59668,1.62793,-0.994141,-0.42749,1.097656,0.931152,0.875488,1.069336,0.70459,...,1.069336,-0.270752,-0.19812,-0.771973,-0.224609,0.0,0.0,1.199219,1.124023,1.62793
3,1.478516,-0.578613,0.302979,0.286621,-0.033783,1.097656,-0.459229,1.492188,-0.343994,-1.419922,...,-0.343994,-0.270752,-0.19812,1.818359,-0.224609,0.0,0.0,0.077087,0.263184,0.302979
4,0.805664,-0.578613,0.564453,-0.994141,-0.42749,1.097656,1.208984,-0.356934,5.308594,0.70459,...,5.308594,-0.270752,-0.19812,1.170898,0.986816,0.0,0.0,0.427246,0.275635,0.564453


## Save and Load
A separate transform step allows us to save the fitted processing and apply it to new data.  
[RUS] Раздельный transform позволяет нам сохранять и переносить обработку на новые данные

In [10]:
# Save the fitted DataPrepare under the name 'de' so it can be restored later.
de.save('de')

Save DataPrepare


In [11]:
# Restore the previously saved DataPrepare ('de') via a fresh instance.
de = DataPrepare().load('de')

In [12]:
# Transform the test features with the restored object and preview the first rows.
de.transform(X_test).head(5)

##################################################
! Start Transform Data
> Clean Binary Features
> Clean Categorical Features
> Transform Categorical Features.
 - Encoder: HelmertEncoder ADD features: 123
 - Encoder: HashingEncoder ADD features: 12
> Generate interaction Num Features
 ADD features: 3
> Normalization Features
##################################################
Final data shape:  (9769, 154)
Total ADD columns: 140
Memory usage of dataframe is 11.48 MB
Memory usage after optimization is: 2.87 MB
Decreased by 75.0%


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,...,HashingEncoder_race,HashingEncoder_capitalgain,HashingEncoder_capitalloss,HashingEncoder_hoursperweek,HashingEncoder_native-country,fnlwgt_Is_Outliers_IQR,education-num_Is_Outliers_IQR,fnlwgt_/_education-num,fnlwgt_*_education-num,fnlwgt_-_education-num
0,1.478516,-0.578613,2.378906,-0.994141,-0.42749,1.097656,-0.181274,0.259277,-0.343994,0.70459,...,-0.343994,-0.270752,-0.19812,-0.124207,-0.224609,0.0,0.0,1.743164,1.722656,2.378906
1,1.478516,-0.578613,-0.094849,-0.481689,-0.820801,1.097656,-0.459229,1.492188,-0.343994,-1.419922,...,-0.343994,-0.270752,-0.19812,-0.124207,-0.224609,0.0,0.0,0.113342,-0.411133,-0.094849
2,1.478516,0.59668,1.62793,-0.994141,-0.42749,1.097656,0.931152,0.875488,1.069336,0.70459,...,1.069336,-0.270752,-0.19812,-0.771973,-0.224609,0.0,0.0,1.199219,1.124023,1.62793
3,1.478516,-0.578613,0.302979,0.286621,-0.033783,1.097656,-0.459229,1.492188,-0.343994,-1.419922,...,-0.343994,-0.270752,-0.19812,1.818359,-0.224609,0.0,0.0,0.077087,0.263184,0.302979
4,0.805664,-0.578613,0.564453,-0.994141,-0.42749,1.097656,1.208984,-0.356934,5.308594,0.70459,...,5.308594,-0.270752,-0.19812,1.170898,0.986816,0.0,0.0,0.427246,0.275635,0.564453


# Feature Engineering

## Categorical Features 
### Encoders

In [13]:
# Available categorical encoders: a mapping from encoder name to its class.
automl_alex.encoders.cat_encoders_names

{'HashingEncoder': category_encoders.hashing.HashingEncoder,
 'SumEncoder': category_encoders.sum_coding.SumEncoder,
 'BackwardDifferenceEncoder': category_encoders.backward_difference.BackwardDifferenceEncoder,
 'OneHotEncoder': category_encoders.one_hot.OneHotEncoder,
 'HelmertEncoder': category_encoders.helmert.HelmertEncoder,
 'BaseNEncoder': category_encoders.basen.BaseNEncoder,
 'CountEncoder': category_encoders.count.CountEncoder}

You can use any of these encoders for categorical features and combine their results (if you have enough memory).

In [14]:
# Encoders used to generate categorical-encoded features; their results are combined.
de = DataPrepare(cat_encoder_names=['HelmertEncoder', 'OneHotEncoder', 'HashingEncoder'])
clean_X_train = de.fit_transform(X_train)

Source data shape:  (39073, 14)
##################################################
! START preprocessing Data
- Auto detect cat features:  12
> Binary Features
> Clean Categorical Features
> Transform Categorical Features.
 - Encoder: HelmertEncoder ADD features: 123
 - Encoder: OneHotEncoder ADD features: 135
 - Encoder: HashingEncoder ADD features: 12
Num of outlier detected: 560 in Feature fnlwgt
Proportion of outlier detected: 1.4 %
Num of outlier detected: 253 in Feature education-num
Proportion of outlier detected: 0.6 %
  No nans features
> Generate interaction Num Features
 ADD features: 3
> Normalization Features
##################################################
Final data shape:  (39073, 289)
Total ADD columns: 275
Memory usage of dataframe is 86.15 MB
Memory usage after optimization is: 21.54 MB
Decreased by 75.0%


## Numerical Features

In [15]:
# Switch to a dataset with more numeric features ('credit-g', version 1, from OpenML).
dataset = fetch_openml(name='credit-g', version=1, as_frame=True)
# Replace the target labels with their integer category codes.
dataset.target = dataset.target.astype('category').cat.codes

# Hold out 20% of the rows for testing, using the shared seed.
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data,
    dataset.target,
    test_size=0.2,
    random_state=RANDOM_SEED,
)
X_train.head(3)

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
29,<0,60.0,delayed previously,business,6836.0,<100,>=7,3.0,male single,none,4.0,no known property,63.0,none,own,2.0,skilled,1.0,yes,yes
535,>=200,21.0,critical/other existing credit,education,2319.0,<100,<1,2.0,male div/sep,none,1.0,car,33.0,none,rent,1.0,skilled,1.0,none,yes
695,no checking,6.0,existing paid,used car,1236.0,500<=X<1000,1<=X<4,2.0,male single,none,4.0,life insurance,50.0,none,rent,1.0,skilled,1.0,none,yes


### Generator interaction Num Features
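Numerical interaction generator features: A/B, A*B, A-B, A+B (with the settings used below, the output contains the A/B, A*B and A-B columns for each pair of numeric features).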

In [16]:
# Turn on numeric interaction features and switch normalization off.
de = DataPrepare(
    # cat_encoder_names=[],
    num_generator_features=True,  # generate interaction features from numeric columns
    normalization=False,          # no normalization (StandardScaler) in this run
)
clean_X_train = de.fit_transform(X_train)

Source data shape:  (800, 20)
##################################################
! START preprocessing Data
- Auto detect cat features:  13
> Binary Features
> Clean Categorical Features
> Transform Categorical Features.
 - Encoder: HelmertEncoder ADD features: 39
 - Encoder: HashingEncoder ADD features: 13
Num of outlier detected: 10 in Feature duration
Proportion of outlier detected: 1.2 %
Num of outlier detected: 6 in Feature age
Proportion of outlier detected: 0.8 %
Num of outlier detected: 38 in Feature credit_amount
Proportion of outlier detected: 4.8 %
  No nans features
> Generate interaction Num Features
 ADD features: 45
##################################################
Final data shape:  (800, 123)
Total ADD columns: 103
Memory usage of dataframe is 0.30 MB
Memory usage after optimization is: 0.17 MB
Decreased by 44.2%


In [17]:
# Preview the generated features, including the numeric interaction columns.
clean_X_train.head(5)

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,age_-_installment_commitment,credit_amount_/_existing_credits,credit_amount_*_existing_credits,credit_amount_-_existing_credits,credit_amount_/_installment_commitment,credit_amount_*_installment_commitment,credit_amount_-_installment_commitment,existing_credits_/_installment_commitment,existing_credits_*_installment_commitment,existing_credits_-_installment_commitment
0,1,48.0,1,1,6836.0,1,1,3.0,1,1,...,60.0,3418.0,13672.0,6832.0,2278.0,20512.0,6832.0,0.666504,6.0,-1.0
1,2,21.0,2,2,2320.0,1,2,2.0,2,1,...,31.0,2320.0,2320.0,2318.0,1160.0,4640.0,2316.0,0.5,2.0,-1.0
2,3,6.0,3,3,1236.0,2,3,2.0,1,1,...,48.0,1236.0,1236.0,1235.0,618.0,2472.0,1234.0,0.5,2.0,-1.0
3,3,21.0,4,4,5004.0,3,3,1.0,3,1,...,28.0,2502.0,10008.0,5000.0,5004.0,5004.0,5000.0,2.0,2.0,1.0
4,3,12.0,3,5,886.0,3,3,4.0,3,1,...,17.0,886.0,886.0,885.0,221.5,3544.0,882.0,0.25,4.0,-3.0


**Encoder selection is an important part of any AutoML pipeline**    
Even with such smart preprocessing, don't forget the basic DS rule: **garbage in, garbage out.**

[RUS] Даже с такой умной предобработкой не забывайте: Мусор на входе - мусор на выходе.

<img src="./img/data-cleaning.png" width="500">