In [1]:
# If you run this notebook on Google Colaboratory, uncomment the below to install automl_alex.
#!pip install -q -U automl_alex

In [1]:
import automl_alex
import sklearn
import pandas as pd
import time
from automl_alex import AutoML, AutoMLClassifier, AutoMLRegressor
print('AutoML-Alex version:', automl_alex.__version__)

AutoML-Alex version: 0.11.24


In [2]:
RANDOM_SEED = 42

# Classifier

## Data

In [3]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
dataset = fetch_openml(name='credit-g', version=1, as_frame=True)
dataset.target = dataset.target.astype('category').cat.codes
dataset.data.head(5)

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
0,<0,6.0,critical/other existing credit,radio/tv,1169.0,no known savings,>=7,4.0,male single,none,4.0,real estate,67.0,none,own,2.0,skilled,1.0,yes,yes
1,0<=X<200,48.0,existing paid,radio/tv,5951.0,<100,1<=X<4,2.0,female div/dep/mar,none,2.0,real estate,22.0,none,own,1.0,skilled,1.0,none,yes
2,no checking,12.0,critical/other existing credit,education,2096.0,<100,4<=X<7,2.0,male single,none,3.0,real estate,49.0,none,own,1.0,unskilled resident,2.0,none,yes
3,<0,42.0,existing paid,furniture/equipment,7882.0,<100,4<=X<7,2.0,male single,guarantor,4.0,life insurance,45.0,none,for free,1.0,skilled,2.0,none,yes
4,<0,24.0,delayed previously,new car,4870.0,<100,1<=X<4,3.0,male single,none,4.0,no known property,53.0,none,for free,2.0,skilled,2.0,none,yes


In [7]:
X_train, X_test, y_train, y_test = train_test_split(dataset.data, 
                                                    dataset.target,
                                                    test_size=0.25, 
                                                    random_state=RANDOM_SEED,)
X_train.shape, X_test.shape

((750, 20), (250, 20))

## Model

In [6]:
#model = AutoML(X_train, y_train, X_test, type_of_estimator='classifier', random_state=RANDOM_SEED)

# or Simply
model = AutoMLClassifier(X_train, y_train, X_test, random_state=RANDOM_SEED, verbose=1)

Source X_train shape:  (750, 20) | X_test shape:  (250, 20)
##################################################
Auto detect cat features:  13
> Start preprocessing Data
> Generate cat encodet features
 +  55  Features from  OneHotEncoder
 +  44  Features from  HelmertEncoder
 +  54  Features from  HashingEncoder
 +  16  Features from  FrequencyEncoder
> Generate Frequency Encode num features
 +  4  Frequency Encode Num Features 
> Clean Nans in num features
> Generate interaction Num Features
 +  24  Interaction Features
> Normalization Features
##################################################
> Total Features:  201
##################################################
New X_train shape:  (750, 201) | X_test shape:  (250, 201)


In [7]:
%%time
predict_test, predict_train = model.fit_predict(timeout=1500, verbose=2)

__________________________________________________
Step 1: Model 0
__________________________________________________
100%|██████████| 1/1 [00:04<00:00,  4.43s/it]
--------------------------------------------------
Model 1
One iteration takes ~ 1.1 sec
> Start Auto calibration parameters
[32m[I 2020-11-22 17:40:49,380][0m A new study created in memory with name: no-name-bc95aced-4166-4a42-b077-31385c3f2e68[0m
> Start optimization with the parameters:
CV_Folds =  10
Score_CV_Folds =  3
Feature_Selection =  True
Opt_lvl =  2
Cold_start =  55.0
Early_stoping =  100
Metric =  roc_auc_score
Direction =  maximize
##################################################
Default model OptScore = 0.6906
Optimize: : 184it [14:57,  4.88s/it,  | Model: RandomForest | OptScore: 0.7704 | Best roc_auc_score: 0.8097 +- 0.039278]

 Predict from Models_1
100%|██████████| 3/3 [00:06<00:00,  2.23s/it]
  0%|          | 0/4 [00:00<?, ?it/s]
 > Calc predict policy on Models_1:
 | posible_repeats:  8  | stack_to

In [8]:
predict_test[:5]

array([0.33952545, 0.48657555, 0.62185893, 0.45343977, 0.07799106])

In [9]:
print('Test AUC: ', round(sklearn.metrics.roc_auc_score(y_test, predict_test),4))

Test AUC:  0.8083


In [10]:
model.stack_models_cfgs

Unnamed: 0,score_opt,model_score,score_std,model_name,model_param,wrapper_params,cat_encoders,columns,cv_folds
0,0.7704,0.8097,0.039278,RandomForest,"{'verbose': 0, 'random_state': 42, 'n_jobs': -...",{},"[OneHotEncoder, HelmertEncoder, HashingEncoder...","(credit_amount, OneHotEncoder_personal_status_...",10
1,0.7703,0.8105,0.040204,RandomForest,"{'verbose': 0, 'random_state': 42, 'n_jobs': -...",{},"[OneHotEncoder, HelmertEncoder, HashingEncoder...","(credit_amount, OneHotEncoder_personal_status_...",10
2,0.7699,0.8097,0.039843,RandomForest,"{'verbose': 0, 'random_state': 42, 'n_jobs': -...",{},"[OneHotEncoder, HelmertEncoder, HashingEncoder...","(credit_amount, OneHotEncoder_personal_status_...",10
3,0.7695,0.8088,0.039275,RandomForest,"{'verbose': 0, 'random_state': 42, 'n_jobs': -...",{},"[OneHotEncoder, HelmertEncoder, HashingEncoder...","(credit_amount, OneHotEncoder_personal_status_...",10
4,0.7626,0.7688,0.006243,LinearModel,"{'fit_intercept': False, 'C': 72.39799446110504}",{},"[OneHotEncoder, HelmertEncoder, HashingEncoder...","(duration, age, num_dependents, OneHotEncoder_...",5
5,0.7626,0.7689,0.006349,LinearModel,"{'fit_intercept': False, 'C': 88.24233739381233}",{},"[OneHotEncoder, HelmertEncoder, HashingEncoder...","(duration, age, num_dependents, OneHotEncoder_...",5


# Regression

## Data

In [4]:
# https://www.openml.org/d/543
dataset = fetch_openml(data_id=543, as_frame=True)

X_train, X_test, y_train, y_test = train_test_split(pd.DataFrame(dataset.data), 
                                                    pd.DataFrame(dataset.target), 
                                                    test_size=0.15, 
                                                    random_state=RANDOM_SEED,)

X_train.shape, X_test.shape

((430, 19), (76, 19))

In [5]:
X_train

Unnamed: 0,TOWN,TOWN_ID,TRACT,LON,LAT,MEDV,CMEDV,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B
104,Medford,24.0,3395.0,-71.0690,42.2480,20.1,20.1,0.13960,0.0,8.56,0,0.5200,6.167,90.0,2.4210,5,384.0,20.9,392.69
203,Weston,37.0,3671.0,-71.1990,42.2320,48.5,48.5,0.03510,95.0,2.68,0,0.4161,7.853,33.2,5.1180,4,224.0,14.7,392.78
381,Boston_East_Boston,79.0,407.0,-71.0410,42.2290,10.9,10.9,15.87440,0.0,18.10,0,0.6710,6.545,99.1,1.5192,24,666.0,20.2,396.90
489,Chelsea,89.0,1602.0,-71.0228,42.2335,7.0,7.0,0.18337,0.0,27.74,0,0.6090,5.414,98.3,1.7554,4,711.0,20.1,344.05
69,Wilmington,16.0,3313.0,-71.1110,42.3270,20.9,20.9,0.12816,12.5,6.07,0,0.4090,5.885,33.0,6.4980,4,345.0,18.9,396.90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,Medford,24.0,3397.0,-71.0622,42.2431,19.5,19.5,0.17120,0.0,8.56,0,0.5200,5.836,91.9,2.2110,5,384.0,20.9,395.67
270,Dedham,46.0,4022.0,-71.0870,42.1410,21.1,21.1,0.29916,20.0,6.96,0,0.4640,5.856,42.1,4.4290,3,223.0,18.6,388.65
348,Norwell,69.0,5041.0,-70.9200,42.1016,24.5,24.5,0.01501,80.0,2.01,0,0.4350,6.635,29.7,8.3440,4,280.0,17.0,390.94
435,Boston_Savin_Hill,83.0,903.0,-71.0460,42.1867,13.4,13.4,11.16040,0.0,18.10,0,0.7400,6.629,94.6,2.1247,24,666.0,20.2,109.85


In [6]:
model = AutoMLRegressor(X_train, y_train, X_test, random_state=RANDOM_SEED, verbose=1)

Source X_train shape:  (430, 19) | X_test shape:  (76, 19)
##################################################
Auto detect cat features:  0
> Start preprocessing Data
> Generate cat encodet features
 +  102  Features from  OneHotEncoder
 +  100  Features from  HelmertEncoder
 +  100  Features from  HashingEncoder
 +  3  Features from  FrequencyEncoder
> Generate Frequency Encode num features
 +  16  Frequency Encode Num Features 
> Clean Nans in num features
> Generate interaction Num Features
 +  480  Interaction Features
> Normalization Features
##################################################
> Total Features:  817
##################################################
New X_train shape:  (430, 817) | X_test shape:  (76, 817)


In [7]:
predict_test, predict_train = model.fit_predict(timeout=1500, verbose=2)

__________________________________________________
Step 1: Model 0
__________________________________________________
100%|██████████| 1/1 [00:53<00:00, 53.13s/it]
--------------------------------------------------
Model 1
One iteration takes ~ 7.6 sec
> Start Auto calibration parameters
[32m[I 2020-11-23 09:53:46,319][0m A new study created in memory with name: no-name-8d331de2-19cd-4819-aaf2-353b151e5845[0m
> Start optimization with the parameters:
CV_Folds =  5
Score_CV_Folds =  2
Feature_Selection =  True
Opt_lvl =  1
Cold_start =  55.0
Early_stoping =  100
Metric =  mean_squared_error
Direction =  minimize
##################################################
Default model OptScore = 15.8712
Optimize: : 113it [14:08,  7.51s/it,  | Model: LightGBM | OptScore: 11.4918 | Best mean_squared_error: 10.2078 +- 1.284002]

 Predict from Models_1
100%|██████████| 3/3 [00:21<00:00,  7.04s/it]
  0%|          | 0/4 [00:00<?, ?it/s]
 > Calc predict policy on Models_1:
 | posible_repeats:  4  | 

In [8]:
print('Test MSE: ', round(sklearn.metrics.mean_squared_error(y_test, predict_test),4))

Test MSE:  8.6016
