In [1]:
# If you run this notebook on Google Colaboratory, uncomment the line below to install automl_alex.
#%pip install -q -U automl_alex

In [1]:
import time

import pandas as pd
import sklearn
from sklearn.metrics import roc_auc_score

import automl_alex
from automl_alex import DataPrepare
from automl_alex import BestSingleModel, BestSingleModelClassifier, BestSingleModelRegressor
# Record the installed library version so the outputs below can be tied to it.
print(f'AutoML-Alex version: {automl_alex.__version__}')

AutoML-Alex version: 1.3.5


In [2]:
# Single seed passed to train_test_split, DataPrepare and the BestSingleModelClassifier instances below.
RANDOM_SEED = 42

# Classifier

## Data

In [3]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
# Source dataset: https://www.openml.org/d/179 (requested as pandas objects via as_frame=True).
dataset = fetch_openml(data_id=179, as_frame=True)
# Replace the target with its integer category codes.
dataset.target = dataset.target.astype('category').cat.codes
# Preview the first five feature rows.
dataset.data.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country
0,2,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,1,0,2,United-States
1,3,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,0,United-States
2,2,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,2,United-States
3,3,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,2,United-States
4,1,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,2,Cuba


In [4]:
# Hold out 20% of the rows for testing; the split is seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data,
    dataset.target,
    test_size=0.2,
    random_state=RANDOM_SEED,
)
X_train.shape, X_test.shape

((39073, 14), (9769, 14))

### DataPrepare

In [5]:
# Fit the preprocessing on the training split only, then apply the fitted transformer to the test split.
# NOTE: X_train / X_test are rebound to their transformed versions here, so re-running this cell
# without first re-running the split cell would fit on already-transformed data.
de = DataPrepare(
    normalization=False, # set to True for linear-based models (tree-based models do not need normalization)
    random_state=RANDOM_SEED,
    )
X_train = de.fit_transform(X_train)
X_test = de.transform(X_test)
X_train.shape, X_test.shape

23:54:37 | Source data shape: (39073, 14)
23:54:37 | ##################################################
23:54:37 | ! START preprocessing Data
23:54:37 | - Auto detect cat features: 12
23:54:37 | > Binary Features
23:54:37 | > Clean Categorical Features
23:54:37 | > Transform Categorical Features.
23:54:38 |  - Encoder: HelmertEncoder ADD features: 123
23:54:38 |  - Encoder: CountEncoder ADD features: 12
23:54:38 |   No nans features
23:54:38 | > CleanOutliers
23:54:38 | Num of outlier detected: 253 in Feature education-num
23:54:38 | Proportion of outlier detected: 0.6 %
23:54:38 | Num of outlier detected: 560 in Feature fnlwgt
23:54:38 | Proportion of outlier detected: 1.4 %
23:54:38 | > Generate interaction Num Features
23:54:38 |  ADD features: 5
23:54:38 | ##################################################
23:54:38 | Final data shape: (39073, 156)
23:54:38 | Total ADD columns: 142
23:54:38 | ##################################################
23:54:38 | #############################

((39073, 156), (9769, 156))

## BestSingleModel

In [6]:
# Restrict the search to the four listed model types. No metric is passed,
# so the library falls back to its default (see the log line printed below).
model = BestSingleModelClassifier(
    models_names=['LightGBM', 'ExtraTrees', 'RandomForest', 'XGBoost'],
    auto_parameters=True,
    feature_selection=True,
    random_state=RANDOM_SEED,
)

23:54:39 | metric is None! Default metric will be used. classifier: AUC, regression: MSE


In [7]:
# Run the parameter search on the training split. With timeout=600 the log below reports only
# ~37 possible iterations and asks for a larger timeout; raise it for a proper optimization run.
history = model.opt(X_train, y_train, timeout=600, verbose=3)

23:54:40 | ##################################################
23:54:40 | > Step 1: calc parameters and pruned score: get test 10 trials
23:57:21 |  One iteration ~ 16.1 sec
23:57:21 |  Possible iters ~ 37.0
23:57:21 | ! Not enough time to find the optimal parameters. 
                     Possible iters < 100. 
                     Please, Increase the 'timeout' parameter for normal optimization.
23:57:21 | --------------------------------------------------
23:57:21 |   Pruned Threshold Score: 0.9016
23:57:21 | ##################################################
23:57:21 | > Step 2: Full opt with Threshold Score Pruner
23:57:21 | ##################################################
23:57:21 | > Start optimization with the parameters:
23:57:21 | CV_Folds = 7
23:57:21 | Score_CV_Folds = 2
23:57:21 | Feature_Selection = True
23:57:21 | Opt_lvl = 1
23:57:21 | Cold_start = 10
23:57:21 | Early_stoping = 25
23:57:21 | Metric = roc_auc_score
23:57:21 | Direction = maximize
23:57:21 | ############

In [8]:
# Predict on the held-out split with the optimized model.
# NOTE(review): the result is fed to roc_auc_score in the next cell, so it is presumably a
# score/probability rather than a hard label — confirm against the automl_alex docs.
predicts = model.predict(X_test)

In [9]:
# Use the explicitly imported roc_auc_score: a bare `import sklearn` does not guarantee that the
# `sklearn.metrics` submodule has been loaded, so `sklearn.metrics.roc_auc_score` only worked
# when some other import happened to pull it in.
print('Test AUC: ', round(roc_auc_score(y_test, predicts), 4))

Test AUC:  0.9106


In [10]:
# Name of the model type that the search selected.
model.best_model_name

'LightGBM'

In [11]:
# Parameters of the selected model.
model.best_model_param

{'random_seed': 42,
 'num_iterations': 300,
 'verbose': -1,
 'device_type': 'cpu',
 'objective': 'binary',
 'num_leaves': 5,
 'learning_rate': 0.10826717203458593}

In [12]:
# Show the first ten selected feature columns
# (relevant when the model was created with feature_selection=True).
model.select_columns[:10]

['workclass',
 'fnlwgt',
 'education-num',
 'marital-status',
 'race',
 'sex',
 'capitalgain',
 'capitalloss',
 'hoursperweek',
 'native-country']

## Save & Load

In [13]:
# Persist the fitted model under the name 'model_1'; it is loaded back from that name in the next cell.
model.save('model_1')

00:08:15 | Save Model
00:08:15 | Save Model
00:08:15 | Save Model
00:08:15 | Save Model
00:08:15 | Save Model
00:08:15 | Save Model
00:08:15 | Save Model
Save model


In [14]:
# Create a fresh classifier and bind model_new to whatever load() returns for 'model_1'.
model_new = BestSingleModelClassifier(random_state=RANDOM_SEED).load('model_1')

00:08:16 | metric is None! Default metric will be used. classifier: AUC, regression: MSE
00:08:16 | Load Model
Finished loading model, total used 300 iterations
00:08:16 | Load Model
Finished loading model, total used 300 iterations
00:08:16 | Load Model
Finished loading model, total used 300 iterations
00:08:16 | Load Model
Finished loading model, total used 300 iterations
00:08:16 | Load Model
Finished loading model, total used 300 iterations
00:08:16 | Load Model
Finished loading model, total used 300 iterations
00:08:17 | Load Model
Finished loading model, total used 300 iterations
Load CrossValidation
Load model


In [15]:
# Re-score the test split with the reloaded model; the printed AUC should match the value
# obtained before saving. roc_auc_score is imported explicitly because a bare `import sklearn`
# does not guarantee that the `sklearn.metrics` submodule is loaded.
predicts = model_new.predict(X_test)
print('Test AUC: ', round(roc_auc_score(y_test, predicts), 4))

Test AUC:  0.9106
