In [1]:
# If you run this notebook on Google Colaboratory, uncomment the below to install automl_alex.
#!pip install -q -U automl_alex

In [1]:
import automl_alex
import sklearn
import pandas as pd
import time
from automl_alex import DataPrepare
from automl_alex import AutoML, AutoMLClassifier, AutoMLRegressor
print('AutoML-Alex version:', automl_alex.__version__)

AutoML-Alex version: 1.02.15


In [2]:
RANDOM_SEED = 42

# Classifier

## Data

In [3]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
dataset = fetch_openml(name='credit-g', version=1, as_frame=True)
dataset.target = dataset.target.astype('category').cat.codes
dataset.data.head(5)

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
0,<0,6.0,critical/other existing credit,radio/tv,1169.0,no known savings,>=7,4.0,male single,none,4.0,real estate,67.0,none,own,2.0,skilled,1.0,yes,yes
1,0<=X<200,48.0,existing paid,radio/tv,5951.0,<100,1<=X<4,2.0,female div/dep/mar,none,2.0,real estate,22.0,none,own,1.0,skilled,1.0,none,yes
2,no checking,12.0,critical/other existing credit,education,2096.0,<100,4<=X<7,2.0,male single,none,3.0,real estate,49.0,none,own,1.0,unskilled resident,2.0,none,yes
3,<0,42.0,existing paid,furniture/equipment,7882.0,<100,4<=X<7,2.0,male single,guarantor,4.0,life insurance,45.0,none,for free,1.0,skilled,2.0,none,yes
4,<0,24.0,delayed previously,new car,4870.0,<100,1<=X<4,3.0,male single,none,4.0,no known property,53.0,none,for free,2.0,skilled,2.0,none,yes


In [4]:
X_train, X_test, y_train, y_test = train_test_split(dataset.data, 
                                                    dataset.target,
                                                    test_size=0.25, 
                                                    random_state=RANDOM_SEED,)
X_train.shape, X_test.shape

((750, 20), (250, 20))

## AutoML

In [5]:
model = AutoMLClassifier(random_state=RANDOM_SEED,)
model = model.fit(X_train, y_train, timeout=600)

Source data shape:  (750, 20)
##################################################
! START preprocessing Data
- Auto detect cat features:  13
> Binary Features
> Clean Categorical Features
> Transform Categorical Features.
 - Encoder: OneHotEncoder ADD features: 52
> CleanOutliers
Num of outlier detected: 4 in Feature age
Proportion of outlier detected: 0.5 %
Num of outlier detected: 36 in Feature credit_amount
Proportion of outlier detected: 4.8 %
Num of outlier detected: 9 in Feature duration
Proportion of outlier detected: 1.2 %
  No nans features
> Generate interaction Num Features
 ADD features: 45
##################################################
Final data shape:  (750, 123)
Total ADD columns: 103
Memory usage of dataframe is 0.31 MB
Memory usage after optimization is: 0.20 MB
Decreased by 36.6%
##################################################
> Start Opt Model
classifier optimize: maximize
One iteration takes ~ 0.8 sec
> Start Auto calibration parameters
> Start optimization w

In [7]:
X_train.median()

duration                    18.0
credit_amount             2332.0
installment_commitment       3.0
residence_since              3.0
age                         33.0
existing_credits             1.0
num_dependents               1.0
dtype: float64

In [6]:
predicts = model.predict(X_test)

In [7]:
print('Test AUC: ', round(sklearn.metrics.roc_auc_score(y_test, predicts),4))

Test AUC:  0.8076


In [8]:
c_X_test = model.de.transform(X_test)

In [9]:
predicts = model.model_1.predict_proba(c_X_test)
print('Test AUC: ', round(sklearn.metrics.roc_auc_score(y_test, predicts),4))

Test AUC:  0.805


In [10]:
predicts = model.model_3.predict_proba(c_X_test[model._select_features_model_3])
print('Test AUC: ', round(sklearn.metrics.roc_auc_score(y_test, predicts),4))

Test AUC:  0.7524


In [11]:
cs_X_test = model._scaler.transform(c_X_test)
predicts = model.model_4.predict_proba(cs_X_test)
print('Test AUC: ', round(sklearn.metrics.roc_auc_score(y_test, predicts),4))

Test AUC:  0.7989


In [12]:
predicts = model.model_5.predict_proba(cs_X_test)
print('Test AUC: ', round(sklearn.metrics.roc_auc_score(y_test, predicts),4))

Test AUC:  0.7974


In [16]:
model.__annotations__

AttributeError: 'AutoMLClassifier' object has no attribute '__annotations__'

## Save & Load

In [13]:
model.save('AutoML_model_1')

TypeError: cannot pickle 'dict_keys' object

In [None]:
model_new = AutoMLClassifier(random_state=RANDOM_SEED,)
model_new.load('AutoML_model_1')

In [None]:
predicts = model_new.predict(X_test)
print('Test AUC: ', round(sklearn.metrics.roc_auc_score(y_test, predicts),4))

# Regression

## Data

In [4]:
# https://www.openml.org/d/543
dataset = fetch_openml(data_id=543, as_frame=True)

X_train, X_test, y_train, y_test = train_test_split(pd.DataFrame(dataset.data), 
                                                    pd.DataFrame(dataset.target), 
                                                    test_size=0.15, 
                                                    random_state=RANDOM_SEED,)

X_train.shape, X_test.shape

((430, 19), (76, 19))

In [5]:
X_train.head(5)

Unnamed: 0,TOWN,TOWN_ID,TRACT,LON,LAT,MEDV,CMEDV,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B
104,Medford,24.0,3395.0,-71.069,42.248,20.1,20.1,0.1396,0.0,8.56,0,0.52,6.167,90.0,2.421,5,384.0,20.9,392.69
203,Weston,37.0,3671.0,-71.199,42.232,48.5,48.5,0.0351,95.0,2.68,0,0.4161,7.853,33.2,5.118,4,224.0,14.7,392.78
381,Boston_East_Boston,79.0,407.0,-71.041,42.229,10.9,10.9,15.8744,0.0,18.1,0,0.671,6.545,99.1,1.5192,24,666.0,20.2,396.9
489,Chelsea,89.0,1602.0,-71.0228,42.2335,7.0,7.0,0.18337,0.0,27.74,0,0.609,5.414,98.3,1.7554,4,711.0,20.1,344.05
69,Wilmington,16.0,3313.0,-71.111,42.327,20.9,20.9,0.12816,12.5,6.07,0,0.409,5.885,33.0,6.498,4,345.0,18.9,396.9


## AutoML

In [6]:
model = AutoMLRegressor(random_state=RANDOM_SEED,)
model = model.fit(X_train, y_train, timeout=600, verbose=1)

Source data shape:  (430, 19)
##################################################
! START preprocessing Data
> Binary Features
> CleanOutliers
Num of outlier detected: 27 in Feature CMEDV
Proportion of outlier detected: 6.3 %
Num of outlier detected: 1 in Feature DIS
Proportion of outlier detected: 0.2 %
Num of outlier detected: 15 in Feature RM
Proportion of outlier detected: 3.5 %
Num of outlier detected: 60 in Feature B
Proportion of outlier detected: 14.0 %
Num of outlier detected: 49 in Feature ZN
Proportion of outlier detected: 11.4 %
Num of outlier detected: 18 in Feature LON
Proportion of outlier detected: 4.2 %
Num of outlier detected: 47 in Feature CRIM
Proportion of outlier detected: 10.9 %
Num of outlier detected: 26 in Feature MEDV
Proportion of outlier detected: 6.0 %
  No nans features
> Generate interaction Num Features
 ADD features: 360
##################################################
Final data shape:  (430, 395)
Total ADD columns: 376
Memory usage of dataframe is 0

CatBoostError: features data: pandas.DataFrame column 'TOWN' has dtype 'category' but is not in  cat_features list

In [None]:
predicts = model.predict(X_test)
print('Test MSE: ', round(sklearn.metrics.mean_squared_error(y_test, predicts),4))