In [1]:
# If you run this notebook on Google Colaboratory, uncomment the line below to install automl_alex.
#!pip install -q -U automl_alex

In [1]:
import time

import automl_alex
import pandas as pd
import sklearn
from automl_alex import DataPrepare
from automl_alex import AutoML, AutoMLClassifier, AutoMLRegressor
from sklearn.datasets import fetch_openml
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.model_selection import train_test_split
# Record the library version so the outputs below can be tied to a specific release.
print('AutoML-Alex version:', automl_alex.__version__)

AutoML-Alex version: 1.2.25


In [2]:
# Single seed reused as `random_state` for the train/test splits and the AutoML models.
RANDOM_SEED = 42

# Classification

## Data

In [3]:
# OpenML dataset 179: https://www.openml.org/d/179
# (fetch_openml and train_test_split are imported in the top import cell, so the
# Regression section no longer depends on this cell having been run first.)
dataset = fetch_openml(data_id=179, as_frame=True)
# Replace the class labels with their integer category codes so the target is numeric.
dataset.target = dataset.target.astype('category').cat.codes
dataset.data.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country
0,2,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,1,0,2,United-States
1,3,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,0,United-States
2,2,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,2,United-States
3,3,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,2,United-States
4,1,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,2,Cuba


In [4]:
# Hold out 25% of the rows as the test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data,
    dataset.target,
    test_size=0.25,
    random_state=RANDOM_SEED,
)
(X_train.shape, X_test.shape)

((36631, 14), (12211, 14))

## AutoML

In [5]:
# Build the classifier with the shared seed, then fit it on the training split.
# NOTE(review): timeout=600 is presumably the search time budget in seconds —
# confirm against the automl_alex docs.
model = AutoMLClassifier(random_state=RANDOM_SEED)
model = model.fit(
    X_train,
    y_train,
    timeout=600,
)

Source data shape:  (36631, 14)
##################################################
! START preprocessing Data
- Auto detect cat features:  12
> Binary Features
> Clean Categorical Features
> Transform Categorical Features.
 - Encoder: HelmertEncoder ADD features: 123
 - Encoder: CountEncoder ADD features: 12
 - Encoder: HashingEncoder ADD features: 12
> CleanOutliers
Num of outlier detected: 231 in Feature education-num
Proportion of outlier detected: 0.6 %
Num of outlier detected: 527 in Feature fnlwgt
Proportion of outlier detected: 1.4 %
  No nans features
> Generate interaction Num Features
 ADD features: 3
> Reduce_Memory
Memory usage of dataframe is 24.38 MB
Memory usage after optimization is: 20.79 MB
Decreased by 14.8%
##################################################
Final data shape:  (36631, 166)
Total ADD columns: 152
##################################################
> Start Fit Base Models
##################################################
> Start Opt Model
classifier op

In [6]:
# Predict on the held-out test features.
# NOTE(review): these predictions are passed to roc_auc_score in the next cell; confirm
# that predict() returns scores/probabilities rather than hard class labels.
predicts = model.predict(X_test)

In [7]:
# Call the explicitly imported metric: a bare `import sklearn` does not by itself
# guarantee that the `sklearn.metrics` submodule has been loaded.
print('Test AUC: ', round(roc_auc_score(y_test, predicts), 4))

Test AUC:  0.9125


## Save & Load

In [8]:
# Persist the fitted model under the name 'AutoML_model_3' so it can be reloaded later.
model.save('AutoML_model_3')

Save DataPrepare
Save AutoML


In [9]:
# Create a fresh classifier and restore the saved state from 'AutoML_model_3'.
model_new = AutoMLClassifier(random_state=RANDOM_SEED,)
# The result of load() is rebound to model_new, which is used for prediction below.
model_new = model_new.load('AutoML_model_3')

Load DataPrepare
Load AutoML


In [10]:
# Score the test set with the reloaded model; the AUC should match the value obtained
# before saving.
predicts = model_new.predict(X_test)
# Call the explicitly imported metric: a bare `import sklearn` does not by itself
# guarantee that the `sklearn.metrics` submodule has been loaded.
print('Test AUC: ', round(roc_auc_score(y_test, predicts), 4))

Test AUC:  0.9125


# Regression

## Data

In [15]:
# OpenML dataset 543: https://www.openml.org/d/543
dataset = fetch_openml(data_id=543, as_frame=True)

# With as_frame=True `dataset.data` is already a DataFrame, so it is passed through as-is;
# the target is still wrapped with pd.DataFrame so y_train / y_test stay DataFrames.
# Hold out 15% of the rows as the test set, using the shared seed.
# NOTE(review): the feature frame contains both MEDV and CMEDV (see the preview below);
# confirm which column OpenML uses as the default target for this dataset.
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data,
    pd.DataFrame(dataset.target),
    test_size=0.15,
    random_state=RANDOM_SEED,
)

X_train.shape, X_test.shape

((430, 19), (76, 19))

In [16]:
# Preview the first five rows of the training features.
X_train.head(5)

Unnamed: 0,TOWN,TOWN_ID,TRACT,LON,LAT,MEDV,CMEDV,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B
104,Medford,24.0,3395.0,-71.069,42.248,20.1,20.1,0.1396,0.0,8.56,0,0.52,6.167,90.0,2.421,5,384.0,20.9,392.69
203,Weston,37.0,3671.0,-71.199,42.232,48.5,48.5,0.0351,95.0,2.68,0,0.4161,7.853,33.2,5.118,4,224.0,14.7,392.78
381,Boston_East_Boston,79.0,407.0,-71.041,42.229,10.9,10.9,15.8744,0.0,18.1,0,0.671,6.545,99.1,1.5192,24,666.0,20.2,396.9
489,Chelsea,89.0,1602.0,-71.0228,42.2335,7.0,7.0,0.18337,0.0,27.74,0,0.609,5.414,98.3,1.7554,4,711.0,20.1,344.05
69,Wilmington,16.0,3313.0,-71.111,42.327,20.9,20.9,0.12816,12.5,6.07,0,0.409,5.885,33.0,6.498,4,345.0,18.9,396.9


## AutoML

In [17]:
# Build the regressor with the shared seed, then fit it on the training split.
# NOTE(review): timeout=600 is presumably the search time budget in seconds and
# verbose=1 the logging level — confirm against the automl_alex docs.
model = AutoMLRegressor(random_state=RANDOM_SEED)
model = model.fit(
    X_train,
    y_train,
    timeout=600,
    verbose=1,
)

Source data shape:  (430, 19)
##################################################
! START preprocessing Data
> Binary Features
> Clean Categorical Features
> CleanOutliers
Num of outlier detected: 47 in Feature CRIM
Proportion of outlier detected: 10.9 %
Num of outlier detected: 60 in Feature B
Proportion of outlier detected: 14.0 %
Num of outlier detected: 26 in Feature MEDV
Proportion of outlier detected: 6.0 %
Num of outlier detected: 27 in Feature CMEDV
Proportion of outlier detected: 6.3 %
Num of outlier detected: 1 in Feature DIS
Proportion of outlier detected: 0.2 %
Num of outlier detected: 49 in Feature ZN
Proportion of outlier detected: 11.4 %
Num of outlier detected: 15 in Feature RM
Proportion of outlier detected: 3.5 %
Num of outlier detected: 18 in Feature LON
Proportion of outlier detected: 4.2 %
  No nans features
> Generate interaction Num Features
 ADD features: 360
> Reduce_Memory
Memory usage of dataframe is 0.73 MB
Memory usage after optimization is: 0.62 MB
Decrease

In [18]:
# Predict on the held-out test features and report the mean squared error.
predicts = model.predict(X_test)
# Call the explicitly imported metric: a bare `import sklearn` does not by itself
# guarantee that the `sklearn.metrics` submodule has been loaded.
print('Test MSE: ', round(mean_squared_error(y_test, predicts), 4))

Test MSE:  6.7483
