In [1]:
# If you run this notebook on Google Colaboratory, uncomment the line below to install automl_alex.
#!pip install -q -U automl_alex

In [1]:
import sklearn
import pandas as pd
import time
import automl_alex
from automl_alex import DataPrepare
from automl_alex import BestSingleModel, BestSingleModelClassifier, BestSingleModelRegressor
# Record the installed library version so the outputs below can be tied to a release.
print('AutoML-Alex version:', automl_alex.__version__)

AutoML-Alex version: 1.3.8


In [2]:
# Single seed shared by the train/test split and the model constructors below.
RANDOM_SEED = 42

# Classifier

## Data

In [3]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
# OpenML dataset 179, loaded as pandas objects: https://www.openml.org/d/179
dataset = fetch_openml(data_id=179, as_frame=True)
# Replace the target in place with its integer category codes.
dataset.target = dataset.target.astype('category').cat.codes
# Preview the first rows of the feature frame.
dataset.data.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country
0,2,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,1,0,2,United-States
1,3,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,0,United-States
2,2,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,2,United-States
3,3,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,2,United-States
4,1,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,2,Cuba


In [4]:
# Hold out 20% of the rows for testing; the split is seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data,
    dataset.target,
    test_size=0.2,
    random_state=RANDOM_SEED,
)
# Show the resulting shapes as the cell output.
(X_train.shape, X_test.shape)

((39073, 14), (9769, 14))

## BestSingleModel

In [5]:
# Candidate pools passed to the constructor, named up front so the call stays short.
# NOTE(review): the recorded best_params show one boolean per encoder/operation, so these
# look like options the search switches on and off — confirm against the automl_alex docs.
candidate_models = ['LightGBM', 'ExtraTrees', 'LinearModel', 'RandomForest']
categorical_encoders = [
    "HelmertEncoder",
    "OneHotEncoder",
    "CountEncoder",
    "HashingEncoder",
    "BackwardDifferenceEncoder",
]
target_encoders = [
    "TargetEncoder",
    "JamesSteinEncoder",
    "CatBoostEncoder",
]

# No metric is passed here; the log printed below reports which default metric is used.
model = BestSingleModelClassifier(
    clean_and_encod_data=True,
    opt_data_prepare=True,
    models_names=candidate_models,
    cat_encoder_names=categorical_encoders,
    target_encoders_names=target_encoders,
    clean_outliers=[True, False],
    num_generator_select_operations=True,
    num_generator_operations=["/", "*", "-"],
    auto_parameters=True,
    feature_selection=False,
    random_state=RANDOM_SEED,
)

19:14:35 | metric is None! Default metric will be used. classifier: AUC, regression: MSE


In [6]:
# Run the search with a 600-second budget; `history` is sorted by its 'value' column further down to rank trials.
# NOTE(review): the recorded log warns that this budget allows only ~53 iterations (< 100);
# increase `timeout` for a fuller optimization.
history = model.opt(X_train, y_train, timeout=600, verbose=3)

19:14:36 | ##################################################
19:14:36 | > Step 1: calc parameters and pruned score: get test 10 trials
19:16:29 |  One iteration ~ 11.3 sec
19:16:29 |  Possible iters ~ 53.0
19:16:29 | ! Not enough time to find the optimal parameters. 
                     Possible iters < 100. 
                     Please, Increase the 'timeout' parameter for normal optimization.
19:16:29 | --------------------------------------------------
19:16:29 |   Pruned Threshold Score: 0.9099
19:16:29 | ##################################################
19:16:29 | > Step 2: Full opt with Threshold Score Pruner
19:16:29 | ##################################################
19:16:29 | > Start optimization with the parameters:
19:16:29 | CV_Folds = 7
19:16:29 | Score_CV_Folds = 3
19:16:29 | Feature_Selection = False
19:16:29 | Opt_lvl = 1
19:16:29 | Cold_start = 15
19:16:29 | Early_stoping = 25
19:16:29 | Metric = roc_auc_score
19:16:29 | Direction = maximize
19:16:29 | ###########

In [7]:
# Predict on the held-out split; the result is scored with ROC AUC in the next cell.
predicts = model.predict(X_test)

In [8]:
# ROC AUC of the held-out predictions, reported to four decimals.
test_auc = sklearn.metrics.roc_auc_score(y_test, predicts)
print('Test AUC: ', round(test_auc, 4))

Test AUC:  0.9109


In [9]:
# Name of the model the search selected (see output below).
model.best_model_name

'LightGBM'

In [10]:
# Hyperparameters of the selected model (see output below).
model.best_model_param

{'random_seed': 42,
 'num_iterations': 300,
 'verbose': -1,
 'device_type': 'cpu',
 'objective': 'binary',
 'num_leaves': 5,
 'learning_rate': 0.06333268775321842}

In [11]:
# Inspect the selected features.
# Applies only if the model was created with feature_selection=True.
#model.select_columns[:10]

In [12]:
# Best parameter set recorded on the study object: the output lists the encoder switches,
# feature-generation operations, outlier cleaning flag, and model settings.
model.study.best_params

{'*': False,
 '-': False,
 '/': True,
 'BackwardDifferenceEncoder': True,
 'CatBoostEncoder': True,
 'CountEncoder': True,
 'HashingEncoder': True,
 'HelmertEncoder': True,
 'JamesSteinEncoder': True,
 'OneHotEncoder': True,
 'TargetEncoder': True,
 'de_clean_outliers': False,
 'lgbm_learning_rate': 0.06333268775321842,
 'lgbm_num_leaves': 5,
 'model_name': 'LightGBM'}

In [14]:
# Rank trials best-first: descending when the metric is maximized, ascending otherwise,
# then keep the ten leading rows.
sort_ascending = model.direction != "maximize"
top_10_cfg = history.sort_values('value', ascending=sort_ascending).head(10)

In [37]:
# Trial numbers of the ten best configurations, best first.
top_10_cfg.number.values

array([ 8, 22, 21, 20, 12, 23, 24,  3, 10,  7])

In [29]:
# Show the parameters of the top-ranked trial. The trial number is read from top_10_cfg
# instead of being hard-coded, so the cell stays correct when a re-run ranks a different
# trial first (in the recorded run this number is 8).
model.study.trials[top_10_cfg.number.values[0]].params

{'model_name': 'LightGBM',
 'lgbm_num_leaves': 44,
 'lgbm_learning_rate': 0.012886065671894011,
 'HelmertEncoder': True,
 'OneHotEncoder': True,
 'CountEncoder': False,
 'HashingEncoder': True,
 'BackwardDifferenceEncoder': False,
 'TargetEncoder': False,
 'JamesSteinEncoder': False,
 'CatBoostEncoder': True,
 'de_clean_outliers': True,
 '/': True,
 '*': True,
 '-': False}

In [42]:
# Refit using the parameters of the top-ranked trial. The trial number is taken from
# top_10_cfg rather than hard-coded, so a re-run with a different ranking still refits
# the best configuration (in the recorded run this number is 8).
best_trial_number = top_10_cfg.number.values[0]
model = model.get_model_from_iter(X_train, y_train, model.study.trials[best_trial_number].params)

Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations


In [44]:
# Score the refitted model on the held-out split.
predicts = model.predict(X_test)
test_auc = sklearn.metrics.roc_auc_score(y_test, predicts)
print('Test AUC: ', round(test_auc, 4))

Test AUC:  0.9109


In [45]:
import copy

In [49]:
import os
import contextlib

In [55]:
# Refit the five best-ranked trial configurations, score each on the test split, and keep
# an independent copy of every fitted model.
#
# Fixes relative to the earlier version of this cell:
#   * predictions and the stored copy come from the object returned by
#     get_model_from_iter(), not from whatever `model` currently holds;
#   * only the refit/predict calls are silenced — the per-model report is printed outside
#     the redirect, where it is actually visible;
#   * slicing the ranked trial numbers tolerates studies with fewer than five trials.
top10_fited_models = {}
models_predicts = []

for i, n_model in enumerate(top_10_cfg.number.values[:5]):
    # Send stdout to os.devnull while refitting and predicting.
    with open(os.devnull, "w") as f, contextlib.redirect_stdout(f):
        fitted_model = model.get_model_from_iter(X_train, y_train, model.study.trials[n_model].params)
        predicts = fitted_model.predict(X_test)

    models_predicts.append(predicts)
    # Deep copy so later refits cannot alter the stored model.
    top10_fited_models[f'model_{i+1}'] = copy.deepcopy(fitted_model)

    print(f'model_{i+1}')
    print('Test AUC: ', round(sklearn.metrics.roc_auc_score(y_test, predicts),4))

In [59]:
# Refit the five best-ranked trials in order and keep a deep copy of each under
# model_1 .. model_5. Python-level stdout is redirected to os.devnull during the refits.
with open(os.devnull, "w") as f, contextlib.redirect_stdout(f):
    model_1, model_2, model_3, model_4, model_5 = [
        copy.deepcopy(
            model.get_model_from_iter(
                X_train,
                y_train,
                model.study.trials[top_10_cfg.number.values[rank]].params,
            )
        )
        for rank in range(5)
    ]

Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations


In [60]:
# One prediction result per refitted model, in rank order.
models_predicts = [
    fitted.predict(X_test)
    for fitted in (model_1, model_2, model_3, model_4, model_5)
]

In [62]:
# Blend the stored predictions: one DataFrame row per model, so the default column-wise
# mean()/median() combine the models' outputs position by position
# (assumes each predict() result is 1-D — TODO confirm).
# The first printed line is the mean blend, the second the median blend.
blend_frame = pd.DataFrame(models_predicts)
for blended in (blend_frame.mean(), blend_frame.median()):
    print('Test AUC: ', round(sklearn.metrics.roc_auc_score(y_test, blended), 4))

Test AUC:  0.9108
Test AUC:  0.9108


In [27]:
import numpy as np
import pandas as pd

In [24]:
# Return to the top-ranked trial before saving. The trial number comes from top_10_cfg
# instead of a hard-coded 8, so a re-run with a different ranking still refits the best
# configuration.
best_trial_number = top_10_cfg.number.values[0]
model = model.get_model_from_iter(X_train, y_train, model.study.trials[best_trial_number].params)

Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations


In [25]:
# Score on the test split before saving; the same figure is recomputed after reloading.
predicts = model.predict(X_test)
test_auc = sklearn.metrics.roc_auc_score(y_test, predicts)
print('Test AUC: ', round(test_auc, 4))

Test AUC:  0.9097


## Save & Load

In [26]:
# Persist the fitted model under the name 'model_1'; the next cell loads it back by the same name.
model.save('model_1')

Save model


In [27]:
# Build a fresh classifier and load the state saved as 'model_1' into it.
model_new = BestSingleModelClassifier(random_state=RANDOM_SEED).load('model_1')

18:03:27 | Load DataPrepare
18:03:28 | Load Model
Finished loading model, total used 300 iterations
18:03:28 | Load Model
Finished loading model, total used 300 iterations
18:03:28 | Load Model
Finished loading model, total used 300 iterations
18:03:28 | Load Model
Finished loading model, total used 300 iterations
18:03:28 | Load Model
Finished loading model, total used 300 iterations
18:03:28 | Load Model
Finished loading model, total used 300 iterations
18:03:28 | Load Model
Finished loading model, total used 300 iterations
Load CrossValidation
Load model


In [28]:
# Score the reloaded model on the test split; the recorded output matches the pre-save score.
predicts = model_new.predict(X_test)
test_auc = sklearn.metrics.roc_auc_score(y_test, predicts)
print('Test AUC: ', round(test_auc, 4))

Test AUC:  0.9097
