In [1]:
# If you run this notebook on Google Colaboratory, uncomment the line below to install automl_alex.
#!pip install -q -U automl_alex

In [1]:
import time

import pandas as pd
import sklearn
from sklearn.metrics import roc_auc_score

import automl_alex
from automl_alex import DataPrepare
from automl_alex import BestSingleModel, BestSingleModelClassifier, BestSingleModelRegressor
# Record the library version used for this run so the outputs can be reproduced.
print('AutoML-Alex version:', automl_alex.__version__)

AutoML-Alex version: 1.2.28


In [2]:
# Single seed reused for the train/test split, the data preparation and the models below.
RANDOM_SEED: int = 42

# Classifier

## Data

In [3]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

# Download OpenML dataset 179 (https://www.openml.org/d/179) as pandas objects.
dataset = fetch_openml(data_id=179, as_frame=True)

# Replace the target labels with their integer category codes.
dataset.target = (
    dataset.target
    .astype('category')
    .cat.codes
)

# Preview the first rows of the feature frame.
dataset.data.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native-country
0,2,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,1,0,2,United-States
1,3,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,0,United-States
2,2,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,2,United-States
3,3,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,2,United-States
4,1,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,2,Cuba


In [4]:
# Hold out 20% of the rows for testing; the split is seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data,
    dataset.target,
    test_size=0.2,
    random_state=RANDOM_SEED,
)

# Show the resulting feature-frame shapes.
X_train.shape, X_test.shape

((39073, 14), (9769, 14))

### DataPrepare

In [5]:
# Build the preprocessor. Normalization is off because tree-based models do not
# need it; set normalization=True when using linear models.
de = DataPrepare(normalization=False, random_state=RANDOM_SEED)

# Fit on the training split only, then apply the fitted transform to the test split.
# NOTE: X_train / X_test are overwritten here, so re-run the train/test split cell
# before re-running this one; otherwise already-prepared data is prepared again.
X_train = de.fit_transform(X_train)
X_test = de.transform(X_test)

# Show the shapes after preprocessing.
X_train.shape, X_test.shape

00:49:41 | Source data shape: (39073, 14)
00:49:41 | ##################################################
00:49:41 | ! START preprocessing Data
00:49:42 | - Auto detect cat features: 12
00:49:42 | > Binary Features
00:49:42 | > Clean Categorical Features
00:49:42 | > Transform Categorical Features.
00:49:43 |  - Encoder: HelmertEncoder ADD features: 123
00:49:43 |  - Encoder: CountEncoder ADD features: 12
00:49:43 | > CleanOutliers
00:49:43 | Num of outlier detected: 253 in Feature education-num
00:49:43 | Proportion of outlier detected: 0.6 %
00:49:43 | Num of outlier detected: 560 in Feature fnlwgt
00:49:43 | Proportion of outlier detected: 1.4 %
00:49:43 |   No nans features
00:49:43 | > Generate interaction Num Features
00:49:43 |  ADD features: 5
00:49:43 | ##################################################
00:49:43 | Final data shape: (39073, 156)
00:49:43 | Total ADD columns: 142
00:49:43 | ##################################################
00:49:43 | #############################

((39073, 156), (9769, 156))

## BestSingleModel

In [6]:
# Candidate model families to search over; feature selection is part of the search.
# No metric is passed, so the library's default metric is used (the log line printed
# by this cell reports AUC for classifiers).
model = BestSingleModelClassifier(
    models_names=['LightGBM', 'ExtraTrees', 'RandomForest', 'XGBoost'],
    auto_parameters=True,
    feature_selection=True,
    random_state=RANDOM_SEED,
)

00:49:43 | metric is None! Default metric will be used. classifier: AUC, regression: MSE


In [7]:
# Run the model/hyper-parameter search on the prepared training data.
# `timeout` bounds the search budget (presumably seconds; confirm in the automl_alex docs).
# The log of this run warns that 600 allows fewer than 100 iterations, so raise it
# for a more thorough search.
model.opt(
    X_train,
    y_train,
    timeout=600,
    verbose=3,
)

00:49:49 | ##################################################
00:49:49 | > Step 1: calc parameters and pruned score: get test 10 trials
00:51:40 |  One iteration ~ 11.1 sec
00:51:40 |  Possible iters ~ 54.0
00:51:40 | ! Not enough time to find the optimal parameters. 
                     Possible iters < 100. 
                     Please, Increase the 'timeout' parameter for normal optimization.
00:51:40 | --------------------------------------------------
00:51:40 |   Pruned Threshold Score: 0.9061
00:51:40 | ##################################################
00:51:40 | > Step 2: Full opt with Threshold Score Pruner
00:51:40 | ##################################################
00:51:40 | > Start optimization with the parameters:
00:51:40 | CV_Folds = 10
00:51:40 | Score_CV_Folds = 2
00:51:40 | Feature_Selection = True
00:51:40 | Opt_lvl = 1
00:51:40 | Cold_start = 10
00:51:40 | Early_stoping = 25
00:51:40 | Metric = roc_auc_score
00:51:40 | Direction = maximize
00:51:40 | ###########

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_CountEncoder_age,params_CountEncoder_capitalgain,params_CountEncoder_capitalloss,params_CountEncoder_education,params_CountEncoder_education-num,...,params_occupation,params_race,params_relationship,params_rf_max_depth,params_rf_min_samples_split,params_sex,params_workclass,params_xgb_max_depth,params_xgb_min_child_weight,state
0,0,0.9048,2021-03-02 00:49:49.765976,2021-03-02 00:49:56.262464,0 days 00:00:06.496488,False,True,True,True,True,...,True,False,True,80.0,53.0,True,True,,,COMPLETE
1,1,0.769,2021-03-02 00:49:56.322477,2021-03-02 00:50:38.423642,0 days 00:00:42.101165,True,False,True,False,True,...,True,False,True,,,False,False,5.0,3.0,COMPLETE
2,2,0.9151,2021-03-02 00:50:38.534579,2021-03-02 00:50:46.779759,0 days 00:00:08.245180,True,False,True,True,False,...,False,False,True,,,False,True,,,COMPLETE
3,3,0.914,2021-03-02 00:50:46.896133,2021-03-02 00:50:52.614083,0 days 00:00:05.717950,True,True,True,True,False,...,False,True,True,,,True,True,,,COMPLETE
4,4,0.9144,2021-03-02 00:50:52.735636,2021-03-02 00:51:04.081883,0 days 00:00:11.346247,True,False,True,False,False,...,False,True,True,,,True,True,,,COMPLETE
5,5,0.9012,2021-03-02 00:51:04.167992,2021-03-02 00:51:11.884367,0 days 00:00:07.716375,True,False,True,False,False,...,False,True,True,90.0,85.0,True,True,,,COMPLETE
6,6,0.877,2021-03-02 00:51:11.941630,2021-03-02 00:51:20.307728,0 days 00:00:08.366098,False,False,True,True,False,...,True,False,False,50.0,2.0,False,False,,,COMPLETE
7,7,0.9083,2021-03-02 00:51:20.361832,2021-03-02 00:51:27.396670,0 days 00:00:07.034838,True,True,True,False,True,...,True,True,True,20.0,39.0,False,False,,,COMPLETE
8,8,0.9052,2021-03-02 00:51:27.463799,2021-03-02 00:51:34.037658,0 days 00:00:06.573859,False,False,False,True,True,...,True,False,False,100.0,87.0,True,False,,,COMPLETE
9,9,0.907,2021-03-02 00:51:34.098910,2021-03-02 00:51:40.548988,0 days 00:00:06.450078,False,True,False,False,True,...,False,True,True,10.0,36.0,True,False,,,COMPLETE


In [8]:
# Predict on the held-out test split with the optimized model.
predicts = model.predict(X_test)

In [9]:
# Report ROC AUC on the held-out test split.
# `roc_auc_score` is imported explicitly in the imports cell: reaching it through
# `sklearn.metrics` after a bare `import sklearn` only works when some other import
# happens to have loaded the submodule already.
print('Test AUC: ', round(roc_auc_score(y_test, predicts), 4))

Test AUC:  0.9098


In [10]:
# Name of the model family selected by the search.
model.best_model_name

'LightGBM'

In [11]:
# Parameters of the selected model.
model.best_model_param

{'random_seed': 42,
 'num_iterations': 300,
 'verbose': -1,
 'device_type': 'cpu',
 'objective': 'binary',
 'num_leaves': 22,
 'min_child_samples': 99}

In [17]:
# Features kept by feature selection (relevant when the model was built with
# feature_selection=True); show the first 10 column names.
model.select_columns[:10]

['age',
 'workclass',
 'education-num',
 'marital-status',
 'capitalgain',
 'capitalloss',
 'hoursperweek',
 'HelmertEncoder_age_0',
 'HelmertEncoder_age_2',
 'HelmertEncoder_workclass_0']

## Save & Load

In [13]:
# Persist the optimized model under the name 'model_1' (reloaded in the next cell).
model.save('model_1')

01:02:38 | Save Model
01:02:38 | Save Model
01:02:38 | Save Model
01:02:38 | Save Model
01:02:38 | Save Model
01:02:38 | Save Model
01:02:38 | Save Model
01:02:38 | Save Model
01:02:39 | Save Model
01:02:39 | Save Model
Save model


In [14]:
# Create a fresh classifier and restore the state saved under 'model_1';
# `load` returns the loaded model, which is what `model_new` ends up bound to.
model_new = BestSingleModelClassifier(random_state=RANDOM_SEED).load('model_1')

01:02:40 | metric is None! Default metric will be used. classifier: AUC, regression: MSE
01:02:41 | Load Model
Finished loading model, total used 300 iterations
01:02:41 | Load Model
Finished loading model, total used 300 iterations
01:02:41 | Load Model
Finished loading model, total used 300 iterations
01:02:41 | Load Model
Finished loading model, total used 300 iterations
01:02:41 | Load Model
Finished loading model, total used 300 iterations
01:02:41 | Load Model
Finished loading model, total used 300 iterations
01:02:41 | Load Model
Finished loading model, total used 300 iterations
01:02:41 | Load Model
Finished loading model, total used 300 iterations
01:02:41 | Load Model
Finished loading model, total used 300 iterations
01:02:41 | Load Model
Finished loading model, total used 300 iterations
Load CrossValidation
Load CrossValidation


In [15]:
# Re-score the test split with the reloaded model; an unchanged AUC indicates the
# save/load round trip preserved the model.
# `roc_auc_score` is imported explicitly in the imports cell instead of being reached
# through `sklearn.metrics` after a bare `import sklearn`, which is not guaranteed to
# have loaded the submodule.
predicts = model_new.predict(X_test)
print('Test AUC: ', round(roc_auc_score(y_test, predicts), 4))

Test AUC:  0.9098
