In [1]:
# If you run this notebook on Google Colaboratory, uncomment the line below to install automl_alex.
# (The outputs shown in this notebook were produced with automl_alex 1.3.8.)
#!pip install -q -U automl_alex

In [1]:
import time

import pandas as pd
import sklearn
from sklearn.datasets import fetch_openml
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

import automl_alex
from automl_alex import DataPrepare
from automl_alex import BestSingleModel, BestSingleModelClassifier, BestSingleModelRegressor
# Report the installed automl_alex version so the outputs below can be matched to a release.
print('AutoML-Alex version:', automl_alex.__version__)

AutoML-Alex version: 1.3.8


In [2]:
# Single seed reused for the train/test split and for every model constructed below.
RANDOM_SEED = 42

# Classifier

## Data

In [3]:
# Load OpenML dataset 179 (https://www.openml.org/d/179) as pandas objects.
dataset = fetch_openml(data_id=179, as_frame=True)
# Replace the target labels with their integer category codes; the split cell
# below reads the encoded `dataset.target`.
dataset.target = dataset.target.astype('category').cat.codes
# Preview the first five feature rows.
dataset.data.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country
0,2,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,1,0,2,United-States
1,3,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,0,United-States
2,2,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,2,United-States
3,3,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,2,United-States
4,1,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,2,Cuba


In [4]:
# Hold out 20% of the rows as a test set; the split is seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data,
    dataset.target,
    test_size=0.2,
    random_state=RANDOM_SEED,
)
# Show the (rows, columns) shape of the train and test feature frames.
X_train.shape, X_test.shape

((39073, 14), (9769, 14))

## BestSingleModel

In [5]:
# Configure the single-model search. No metric is passed, so the library falls
# back to its default (the log below reports AUC for classifiers).
model = BestSingleModelClassifier(
    clean_and_encod_data=True,
    opt_data_prepare=True,
    # Candidate model families the search chooses from.
    models_names=['LightGBM', 'ExtraTrees', 'RandomForest'],
    # Categorical encoders; the study's best_params shows an on/off flag for each.
    cat_encoder_names=[
        "HelmertEncoder",
        "OneHotEncoder",
        "CountEncoder",
        "HashingEncoder",
        "BackwardDifferenceEncoder",
    ],
    # Target-based encoders, flagged on/off in the same way.
    target_encoders_names=[
        "TargetEncoder",
        "JamesSteinEncoder",
        "CatBoostEncoder",
    ],
    # Both settings are offered to the search (appears as `de_clean_outliers`).
    clean_outliers=[True, False],
    num_generator_select_operations=True,
    # Arithmetic operations offered for numeric feature generation.
    num_generator_operations=["/", "*", "-"],
    auto_parameters=True,
    feature_selection=False,
    random_state=RANDOM_SEED,
)

15:06:11 | metric is None! Default metric will be used. classifier: AUC, regression: MSE


In [6]:
# Run the search on the training data and keep the returned trial history
# (shown as a table further down).
# NOTE(review): with timeout=600 the log warns that there is not enough time
# for a normal optimization; raise `timeout` for a real run. The unit is
# presumably seconds -- confirm against the automl_alex docs.
history = model.opt(X_train, y_train, timeout=600, verbose=3)

17:48:07 | ##################################################
17:48:07 | > Step 1: calc parameters and pruned score: get test 10 trials
17:50:51 |  One iteration ~ 16.4 sec
17:50:51 |  Possible iters ~ 18.0
17:50:51 | ! Not enough time to find the optimal parameters. 
                     Possible iters < 100. 
                     Please, Increase the 'timeout' parameter for normal optimization.
17:50:51 | --------------------------------------------------
17:50:51 |   Pruned Threshold Score: 0.9054
17:50:51 | ##################################################
17:50:51 | > Step 2: Full opt with Threshold Score Pruner
17:50:51 | ##################################################
17:50:51 | > Start optimization with the parameters:
17:50:51 | CV_Folds = 7
17:50:51 | Score_CV_Folds = 2
17:50:51 | Feature_Selection = False
17:50:51 | Opt_lvl = 1
17:50:51 | Cold_start = 15
17:50:51 | Early_stoping = 25
17:50:51 | Metric = roc_auc_score
17:50:51 | Direction = maximize
17:50:51 | ###########

In [7]:
# Predict on the held-out test features; the result is scored with ROC AUC in the next cell.
predicts = model.predict(X_test)

In [8]:
# ROC AUC of the searched model on the held-out test set, rounded to 4 decimals.
# Uses the explicitly imported `roc_auc_score` instead of reaching into
# `sklearn.metrics` through a bare `import sklearn`, which only works when some
# other import has already loaded that submodule.
print('Test AUC: ', round(roc_auc_score(y_test, predicts), 4))

Test AUC:  0.9122


In [9]:
# Name of the model family selected by the search.
model.best_model_name

'LightGBM'

In [10]:
# Parameters of the selected model.
model.best_model_param

{'random_seed': 42,
 'num_iterations': 300,
 'verbose': -1,
 'device_type': 'cpu',
 'objective': 'binary',
 'num_leaves': 4,
 'learning_rate': 0.2104512193446279}

In [None]:
# Show the first 10 selected features.
# Applies only if the model was created with feature_selection=True.
#model.select_columns[:10]

In [22]:
# Parameters of the best trial recorded by the study: the chosen model, its
# hyperparameters, and the on/off flag for each encoder and numeric operation.
model.study.best_params

{'*': True,
 '-': True,
 '/': True,
 'BackwardDifferenceEncoder': False,
 'CatBoostEncoder': True,
 'CountEncoder': True,
 'HashingEncoder': False,
 'HelmertEncoder': True,
 'JamesSteinEncoder': True,
 'OneHotEncoder': False,
 'TargetEncoder': False,
 'de_clean_outliers': True,
 'lgbm_learning_rate': 0.2104512193446279,
 'lgbm_num_leaves': 4,
 'model_name': 'LightGBM'}

In [35]:
# Five highest-scoring trials, best first.
history.sort_values(by='value', ascending=False).iloc[:5]

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_*,params_-,params_/,params_BackwardDifferenceEncoder,params_CatBoostEncoder,...,params_TargetEncoder,params_de_clean_outliers,params_ext_max_depth,params_ext_min_samples_split,params_lgbm_learning_rate,params_lgbm_num_leaves,params_model_name,params_rf_max_depth,params_rf_min_samples_split,state
12,12,0.9131,2021-03-06 17:51:04.898592,2021-03-06 17:51:18.432278,0 days 00:00:13.533686,True,True,True,False,True,...,False,True,,,0.210451,4.0,LightGBM,,,COMPLETE
8,8,0.9116,2021-03-06 17:50:14.236088,2021-03-06 17:50:42.930326,0 days 00:00:28.694238,True,False,True,False,True,...,False,True,,,0.012886,44.0,LightGBM,,,COMPLETE
0,0,0.9078,2021-03-06 17:48:07.752162,2021-03-06 17:48:26.015151,0 days 00:00:18.262989,False,False,True,True,True,...,True,False,,,,,RandomForest,80.0,53.0,COMPLETE
5,5,0.9066,2021-03-06 17:49:15.035773,2021-03-06 17:49:17.897428,0 days 00:00:02.861655,False,True,False,False,False,...,False,False,,,,,RandomForest,10.0,36.0,COMPLETE
9,9,0.9061,2021-03-06 17:50:42.997106,2021-03-06 17:50:51.803024,0 days 00:00:08.805918,False,False,False,False,True,...,True,True,,,,,RandomForest,10.0,36.0,COMPLETE


In [29]:
# Inspect the parameters of trial number 8, the second-best trial by value
# in the history table above (0.9116).
model.study.trials[8].params

{'model_name': 'LightGBM',
 'lgbm_num_leaves': 44,
 'lgbm_learning_rate': 0.012886065671894011,
 'HelmertEncoder': True,
 'OneHotEncoder': True,
 'CountEncoder': False,
 'HashingEncoder': True,
 'BackwardDifferenceEncoder': False,
 'TargetEncoder': False,
 'JamesSteinEncoder': False,
 'CatBoostEncoder': True,
 'de_clean_outliers': True,
 '/': True,
 '*': True,
 '-': False}

In [24]:
# Rebuild a model on the training data from the parameters of trial 8 instead of the best trial.
# NOTE(review): this rebinds `model`, so every later cell (predict/save) uses
# the rebuilt object rather than the original search result, and re-running
# this cell operates on the rebuilt object. Consider assigning to a new name.
model = model.get_model_from_iter(X_train, y_train, model.study.trials[8].params)

Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations


In [25]:
# Score the model rebuilt from trial 8 on the held-out test set.
predicts = model.predict(X_test)
# Uses the explicitly imported `roc_auc_score` rather than relying on
# `sklearn.metrics` having been loaded as a side effect of another import.
print('Test AUC: ', round(roc_auc_score(y_test, predicts), 4))

Test AUC:  0.9097


## Save & Load

In [26]:
# Persist the current model under the name 'model_1'; it is loaded back in the next cell.
model.save('model_1')

Save model


In [27]:
# Create a fresh classifier and use it to load the model saved under 'model_1'.
model_new = BestSingleModelClassifier(random_state=RANDOM_SEED).load('model_1')

18:03:27 | Load DataPrepare
18:03:28 | Load Model
Finished loading model, total used 300 iterations
18:03:28 | Load Model
Finished loading model, total used 300 iterations
18:03:28 | Load Model
Finished loading model, total used 300 iterations
18:03:28 | Load Model
Finished loading model, total used 300 iterations
18:03:28 | Load Model
Finished loading model, total used 300 iterations
18:03:28 | Load Model
Finished loading model, total used 300 iterations
18:03:28 | Load Model
Finished loading model, total used 300 iterations
Load CrossValidation
Load model


In [28]:
# Score the reloaded model on the held-out test set; it is expected to match
# the score of the model that was saved.
predicts = model_new.predict(X_test)
# Uses the explicitly imported `roc_auc_score` rather than relying on
# `sklearn.metrics` having been loaded as a side effect of another import.
print('Test AUC: ', round(roc_auc_score(y_test, predicts), 4))

Test AUC:  0.9097
