In [1]:
# If you run this notebook on Google Colaboratory, uncomment the below to install automl_alex.
!pip install -q -U automl_alex

[0m

In [2]:
import time

import automl_alex
import pandas as pd
import sklearn
from automl_alex import DataPrepare
from automl_alex import BestSingleModel, BestSingleModelClassifier, BestSingleModelRegressor
from sklearn.metrics import roc_auc_score
# Record the library version next to the results for reproducibility.
print(f'AutoML-Alex version: {automl_alex.__version__}')

  from .autonotebook import tqdm as notebook_tqdm


AutoML-Alex version: 2023.3.11


In [3]:
# Single seed reused by the train/test split and by the model constructors.
RANDOM_SEED: int = 42

# Classifier

## Data

In [4]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
# Source data: OpenML dataset 31 (https://www.openml.org/d/31), loaded as pandas objects.
dataset = fetch_openml(data_id=31, as_frame=True)

# Replace the target labels with their integer category codes.
dataset.target = (
    dataset.target
    .astype('category')
    .cat.codes
)

# Preview the first five feature rows.
dataset.data.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,residence_since,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker
0,<0,6.0,critical/other existing credit,radio/tv,1169.0,no known savings,>=7,4.0,male single,none,4.0,real estate,67.0,none,own,2.0,skilled,1.0,yes,yes
1,0<=X<200,48.0,existing paid,radio/tv,5951.0,<100,1<=X<4,2.0,female div/dep/mar,none,2.0,real estate,22.0,none,own,1.0,skilled,1.0,none,yes
2,no checking,12.0,critical/other existing credit,education,2096.0,<100,4<=X<7,2.0,male single,none,3.0,real estate,49.0,none,own,1.0,unskilled resident,2.0,none,yes
3,<0,42.0,existing paid,furniture/equipment,7882.0,<100,4<=X<7,2.0,male single,guarantor,4.0,life insurance,45.0,none,for free,1.0,skilled,2.0,none,yes
4,<0,24.0,delayed previously,new car,4870.0,<100,1<=X<4,3.0,male single,none,4.0,no known property,53.0,none,for free,2.0,skilled,2.0,none,yes


In [5]:
# Hold out 20% of the rows for the final test score; the split is seeded so
# the same rows land in each part on every run.
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data,
    dataset.target,
    random_state=RANDOM_SEED,
    test_size=0.2,
)

# Sanity check: (rows, columns) of each part.
(X_train.shape, X_test.shape)

((800, 20), (200, 20))

## BestSingleModel

In [6]:
# Configure the single-model search for the classification task.
# No `metric` is passed, so the library logs that it falls back to its
# default (AUC for classifiers).
model = BestSingleModelClassifier(
    random_state=RANDOM_SEED,
    # Candidate estimators the search may choose between.
    models_names=['LightGBM', 'XGBoost', 'LinearModel'],
    auto_parameters=True,
    feature_selection=True,
    # Data-preparation options; exact semantics are defined by automl_alex
    # (NOTE(review): confirm against the library documentation).
    clean_and_encod_data=True,
    opt_data_prepare=True,
    cat_encoder_names=[
        "HelmertEncoder",
        "OneHotEncoder",
        "CountEncoder",
        "HashingEncoder",
        "BackwardDifferenceEncoder",
    ],
    clean_outliers=[True, False],
    num_generator_select_operations=True,
    num_generator_operations=["/", "*", "-"],
)

19:24:34 | metric is None! Default metric will be used. classifier: AUC, regression: MSE


In [7]:
# Run the search on the training split. `history` is the per-trial results
# table that later cells sort and inspect.
history = model.opt(
    X_train,
    y_train,
    verbose=3,
    timeout=600,  # search budget; presumably seconds — TODO confirm
)

19:24:34 | ##################################################
19:24:34 | > Step 1: calc parameters and pruned score: get test 10 trials
19:24:53 |  One iteration ~ 1.8 sec
19:24:53 |  Possible iters ~ 327.0
19:24:53 | --------------------------------------------------
19:24:53 |   Pruned Threshold Score: 0.6228
19:24:53 | ##################################################
19:24:53 | > Step 2: Full opt with Threshold Score Pruner
19:24:53 | ##################################################
19:24:53 | > Start optimization with the parameters:
19:24:53 | CV_Folds = 7
19:24:53 | Score_CV_Folds = 3
19:24:53 | Feature_Selection = True
19:24:53 | Opt_lvl = 2
19:24:53 | Cold_start = 20
19:24:53 | Early_stoping = 30
[32m19:24

In [8]:
# Score the held-out test split with the optimized model.
predicts = model.predict(X_test)

In [9]:
# Test-set ROC AUC, rounded to four decimals.
# NOTE(review): roc_auc_score expects scores/probabilities as its second
# argument; confirm that `predicts` holds those rather than hard class labels.
print('Test AUC: ', round(roc_auc_score(y_test, predicts), 4))

Test AUC:  0.8024


In [10]:
# Name of the estimator the search settled on (one of the `models_names` candidates).
model.best_model_name

'LinearModel'

In [11]:
# Hyperparameters of the selected best model.
model.best_model_param

{'fit_intercept': False,
 'C': 13.348092024903151,
 'solver': 'liblinear',
 'tol': 0.043777538481459946,
 'class_weight': None,
 'n_jobs': 1}

In [12]:
# Selected features (presumably only populated when feature_selection=True — TODO confirm).
# Uncomment to list the first ten:
# model.select_columns[:10]

In [13]:
# Full parameter set of the best trial recorded by the search study
# (data-preparation switches as well as model settings).
model.study.best_params

{'*': False,
 '-': True,
 '/': False,
 'BackwardDifferenceEncoder': True,
 'BackwardDifferenceEncoder_checking_status_0': True,
 'BackwardDifferenceEncoder_checking_status_1': False,
 'BackwardDifferenceEncoder_checking_status_2': True,
 'BackwardDifferenceEncoder_credit_history_0': True,
 'BackwardDifferenceEncoder_credit_history_1': True,
 'BackwardDifferenceEncoder_credit_history_2': True,
 'BackwardDifferenceEncoder_credit_history_3': True,
 'BackwardDifferenceEncoder_employment_0': True,
 'BackwardDifferenceEncoder_employment_1': True,
 'BackwardDifferenceEncoder_employment_2': False,
 'BackwardDifferenceEncoder_employment_3': True,
 'BackwardDifferenceEncoder_existing_credits_0': False,
 'BackwardDifferenceEncoder_existing_credits_1': False,
 'BackwardDifferenceEncoder_existing_credits_2': True,
 'BackwardDifferenceEncoder_housing_0': True,
 'BackwardDifferenceEncoder_housing_1': True,
 'BackwardDifferenceEncoder_installment_commitment_0': True,
 'BackwardDifferenceEncoder_instal

In [14]:
# Five highest-scoring trials, best first.
history.sort_values(by='value', ascending=False).iloc[:5]

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_*,params_-,params_/,params_BackwardDifferenceEncoder,params_BackwardDifferenceEncoder_checking_status_0,...,params_purpose,params_residence_since,params_residence_since_Is_Outliers_IQR,params_savings_status,params_xgb_colsample_bytree,params_xgb_learning_rate,params_xgb_max_depth,params_xgb_min_child_weight,params_xgb_subsample,state
54,54,0.746,2023-03-09 19:26:38.333078,2023-03-09 19:26:39.651560,0 days 00:00:01.318482,False,True,False,True,True,...,True,False,False,False,,,,,,COMPLETE
55,55,0.7459,2023-03-09 19:26:39.675467,2023-03-09 19:26:41.081892,0 days 00:00:01.406425,False,True,False,True,True,...,True,False,False,False,,,,,,COMPLETE
105,105,0.7457,2023-03-09 19:27:57.468501,2023-03-09 19:27:58.971463,0 days 00:00:01.502962,False,True,False,True,True,...,True,False,False,False,,,,,,COMPLETE
104,104,0.7454,2023-03-09 19:27:55.968897,2023-03-09 19:27:57.444682,0 days 00:00:01.475785,False,True,False,True,True,...,True,False,False,False,,,,,,COMPLETE
102,102,0.7453,2023-03-09 19:27:52.971664,2023-03-09 19:27:54.459649,0 days 00:00:01.487985,False,True,False,True,True,...,True,False,False,False,,,,,,COMPLETE


In [15]:
# Parameters sampled for the trial at index 8 of the study.
# NOTE(review): the index is hard-coded and this is not one of the
# top-scoring trials in `history`; the reason for picking it isn't stated.
model.study.trials[8].params

{'model_name': 'LinearModel',
 'lr_fit_intercept': False,
 'lr_C': 76.51183782805862,
 'lr_solver': 'liblinear',
 'lr_tol': 0.0018242400350534868,
 'lr_class_weight': None,
 'HelmertEncoder': False,
 'OneHotEncoder': False,
 'CountEncoder': False,
 'HashingEncoder': True,
 'BackwardDifferenceEncoder': True,
 'de_clean_outliers': False,
 '/': True,
 '*': False,
 '-': True,
 'num_dependents': True,
 'own_telephone': False,
 'foreign_worker': False,
 'BackwardDifferenceEncoder_checking_status_0': True,
 'BackwardDifferenceEncoder_checking_status_1': True,
 'BackwardDifferenceEncoder_checking_status_2': True,
 'BackwardDifferenceEncoder_credit_history_0': True,
 'BackwardDifferenceEncoder_credit_history_1': False,
 'BackwardDifferenceEncoder_credit_history_2': True,
 'BackwardDifferenceEncoder_credit_history_3': False,
 'BackwardDifferenceEncoder_savings_status_0': True,
 'BackwardDifferenceEncoder_savings_status_1': True,
 'BackwardDifferenceEncoder_savings_status_2': False,
 'BackwardDif

In [16]:
# Rebuild a model on the training split from the parameters of trial index 8.
# NOTE(review): this rebinds `model` to the call's return value, so the cell
# is not guaranteed to be re-runnable without re-running the search first.
model = model.get_model_from_iter(
    X_train,
    y_train,
    model.study.trials[8].params,
)

In [17]:
# Score the test split with the model rebuilt from the chosen trial.
predicts = model.predict(X_test)
# NOTE(review): roc_auc_score expects scores/probabilities as its second
# argument; confirm that `predicts` holds those rather than hard class labels.
print('Test AUC: ', round(roc_auc_score(y_test, predicts), 4))

Test AUC:  0.8066


## Save & Load

In [18]:
# Persist the current `model` under the name 'model_1' so it can be reloaded later.
model.save('model_1')

Save model


In [19]:
# Create a fresh classifier and immediately swap it for whatever `load`
# returns for the saved 'model_1'.
model_new = BestSingleModelClassifier(random_state=RANDOM_SEED).load('model_1')

Load CrossValidation
Load model


In [20]:
# The reloaded model should reproduce the score obtained before saving.
predicts = model_new.predict(X_test)
# NOTE(review): roc_auc_score expects scores/probabilities as its second
# argument; confirm that `predicts` holds those rather than hard class labels.
print('Test AUC: ', round(roc_auc_score(y_test, predicts), 4))

Test AUC:  0.8066
