In [1]:
import sklearn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml

import automl_alex
from automl_alex import DataPrepare
from automl_alex import CrossValidation

In [2]:
RANDOM_SEED = 42

In [3]:
import optuna
import time

In [4]:
# add dataset whis more num features
dataset = fetch_openml(name='credit-g', version=1, as_frame=True)
dataset.target = dataset.target.astype('category').cat.codes

X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.2, random_state=RANDOM_SEED,)
X_train.shape, X_test.shape

((800, 20), (200, 20))

In [5]:
de = DataPrepare(
    normalization=False, # if you use Linar based models -> True (for Tree based models normalization is not needed)
    random_state=RANDOM_SEED,
    )
X_train = de.fit_transform(X_train)
X_test = de.transform(X_test)
X_train.shape, X_test.shape

09:12:55 | Source data shape: (800, 20)
09:12:55 | ##################################################
09:12:55 | ! START preprocessing Data
09:12:55 | - Auto detect cat features: 13
09:12:55 | > Binary Features
09:12:55 | > Clean Categorical Features
09:12:55 | > Transform Categorical Features.
09:12:55 |  - Encoder: HelmertEncoder ADD features: 39
09:12:55 |  - Encoder: CountEncoder ADD features: 13
09:12:55 | > CleanOutliers
09:12:55 | Num of outlier detected: 38 in Feature credit_amount
09:12:55 | Proportion of outlier detected: 4.8 %
09:12:55 | Num of outlier detected: 6 in Feature age
09:12:55 | Proportion of outlier detected: 0.8 %
09:12:55 | Num of outlier detected: 10 in Feature duration
09:12:55 | Proportion of outlier detected: 1.2 %
09:12:55 |   No nans features
09:12:55 | > Generate interaction Num Features
09:12:55 |  ADD features: 45
09:12:55 | > Reduce_Memory
09:12:55 | Memory usage of dataframe is 0.45 MB
09:12:55 | Memory usage after optimization is: 0.33 MB
09:12:55 |

((800, 123), (200, 123))

In [6]:
from automl_alex import LightGBM, LightGBMClassifier

# We will use LightGBM for this dataset
model = LightGBM(type_of_estimator='classifier', random_state=RANDOM_SEED)

# or Simply
model = LightGBMClassifier(random_state=RANDOM_SEED)

In [7]:
from automl_alex.optimizer import *

In [8]:
optimizer = Optimizer(model)

In [9]:
model = optimizer.opt(X_train, y_train, timeout=200, verbose=3)

09:10:51 | One iteration takes ~ 8.8 sec
09:10:51 | Not enough time to find the optimal parameters. 
                     Possible iters < 100. 
                     Please, Increase the 'timeout' parameter for normal optimization.
09:10:51 | > Start optimization with the parameters:
09:10:51 | CV_Folds = 5
09:10:51 | Score_CV_Folds = 1
09:10:51 | Feature_Selection = True
09:10:51 | Opt_lvl = 1
09:10:51 | Cold_start = 10
09:10:51 | Early_stoping = 25
09:10:51 | Metric = roc_auc_score
09:10:51 | Direction = maximize
09:10:51 | ##################################################
09:10:51 | > Step 1: calc pruned score => get 10 trials


KeyboardInterrupt: 

In [None]:
model = model.fit(X_train, y_train)
predicts = model.predict_proba(X_test)
print('Test AUC: ', round(sklearn.metrics.roc_auc_score(y_test, predicts),4))

In [None]:
model.select_columns

In [20]:
# time 1 iter
start_time = time.time()
model = model.fit(X_train, y_train)
iter_time = (time.time() - start_time)

In [21]:
timeout = 600

In [22]:
possible_iters = timeout // (iter_time)
possible_iters

307.0

In [23]:
def auto_parameters_calc(possible_iters,):
        """
        Automatic determination of optimization parameters depending on the number of possible iterations

        Args:
            possible_iters (int): possible_iters
            verbose (int): print status

        Return:
            early_stoping (int)
            cv (int)
            score_cv_folds (int)
            opt_lvl (int)
            cold_start (int)
        """
        early_stoping = 25
        folds = 5
        score_folds = 1
        opt_lvl = 1
        cold_start = 10
            
        if possible_iters > 50:
            folds = 10
            score_folds = 2
            cold_start = (possible_iters / score_folds) // 2
            early_stoping = 30

        if possible_iters > 100:
            opt_lvl = 2

        if possible_iters > 300:
            opt_lvl = 3
            score_folds = 3
            cold_start = (possible_iters / score_folds) // 3
            early_stoping = cold_start * 2

        if possible_iters > 500:
            opt_lvl = 4
            score_folds = 4
            cold_start = (possible_iters / score_folds) // 4
            early_stoping = cold_start * 2
        
        if possible_iters > 1000:
            opt_lvl = 5
            score_folds = 5
            cold_start = (possible_iters / score_folds) // 4
            early_stoping = cold_start * 2

        return(early_stoping, folds, score_folds, opt_lvl, cold_start,)

In [24]:
early_stoping, folds, score_folds, opt_lvl, cold_start = \
                    auto_parameters_calc(possible_iters)

In [25]:
def objective(
    trial,
    model, 
    folds, 
    score_folds, 
    opt_lvl,
    return_model=False,
    ):
    model.model_param = model.get_model_opt_params(
            trial=trial, 
            opt_lvl=opt_lvl,
            )
    cv = CrossValidation(
        estimator=model,
        folds=folds,
        score_folds=score_folds,
        n_repeats=1,
        metric=sklearn.metrics.roc_auc_score,
        print_metric=False, 
        metric_round=4, 
        random_state=RANDOM_SEED
        )
    score, score_std = cv.fit_score(X_train, y_train, print_metric=False, trial=trial)
    score_opt = model.__calc_combined_score_opt__('maximize', score, score_std)
            
    if cv._pruned_cv:
        raise optuna.TrialPruned()
    if return_model:
        return(model)
    else:
        return(score_opt)

In [26]:
optuna.logging.disable_default_handler()

In [27]:
model = LightGBMClassifier(random_state=RANDOM_SEED)

In [28]:
sampler = optuna.samplers.TPESampler(seed=42)

In [29]:
study = optuna.create_study(
    sampler=sampler,
    direction='maximize',
    pruner=optuna.pruners.NopPruner(),
    )

In [30]:
dafault_params = model.get_model_start_opt_params()

In [31]:
study.enqueue_trial(dafault_params)

In [32]:
study.optimize(lambda trial: objective(
                                    trial, 
                                    model,
                                    folds, 
                                    score_folds, 
                                    opt_lvl,
                                    ),
                n_trials=10, 
                show_progress_bar=False)

In [33]:
study.best_params 

{'lgbm_min_child_samples': 3,
 'lgbm_bagging_fraction': 1.0,
 'lgbm_learning_rate': 0.13962563737015762,
 'lgbm_num_iterations': 400,
 'lgbm_num_leaves': 8}

In [34]:
study.best_trial

FrozenTrial(number=9, values=[0.69533], datetime_start=datetime.datetime(2021, 2, 28, 2, 39, 27, 790560), datetime_complete=datetime.datetime(2021, 2, 28, 2, 39, 30, 23880), params={'lgbm_min_child_samples': 3, 'lgbm_bagging_fraction': 1.0, 'lgbm_learning_rate': 0.13962563737015762, 'lgbm_num_iterations': 400, 'lgbm_num_leaves': 8}, distributions={'lgbm_min_child_samples': IntLogUniformDistribution(high=100, low=2, step=1), 'lgbm_bagging_fraction': DiscreteUniformDistribution(high=1.0, low=0.4, q=0.1), 'lgbm_learning_rate': LogUniformDistribution(high=0.3, low=0.01), 'lgbm_num_iterations': IntUniformDistribution(high=1000, low=300, step=100), 'lgbm_num_leaves': IntLogUniformDistribution(high=100, low=2, step=1)}, user_attrs={}, system_attrs={}, intermediate_values={0: 0.7309}, trial_id=9, state=TrialState.COMPLETE, value=None)

In [35]:
study.best_value

0.69533

In [37]:
trial.trials

NameError: name 'trial' is not defined

In [21]:
df = study.trials_dataframe()

In [22]:
pruned_scor = df.value.median()

In [23]:
study.pruner = optuna.pruners.ThresholdPruner(lower=pruned_scor)

In [24]:
study.optimize(lambda trial: objective(
                                    trial, 
                                    model,
                                    folds, 
                                    score_folds, 
                                    opt_lvl,
                                    ),
                timeout=600,
                show_progress_bar=False)

00:52:14 | Pruned on 0 fold


In [25]:
optuna.visualization.plot_optimization_history(study)

In [26]:
df = study.trials_dataframe()
df

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_lgbm_bagging_fraction,params_lgbm_bagging_freq,params_lgbm_feature_fraction,params_lgbm_learning_rate,params_lgbm_min_child_samples,params_lgbm_num_iterations,params_lgbm_num_leaves,system_attrs_fixed_params,state
0,0,0.644032,2021-02-28 00:47:33.790410,2021-02-28 00:47:41.385382,0 days 00:00:07.594972,0.6,9.0,1.0,0.100000,20,300,31,"{'lgbm_num_leaves': 31, 'lgbm_min_child_sample...",COMPLETE
1,1,0.662644,2021-02-28 00:47:41.386177,2021-02-28 00:47:48.771069,0 days 00:00:07.384892,0.5,9.0,0.5,0.031112,19,1000,29,,COMPLETE
2,2,0.679723,2021-02-28 00:47:48.771769,2021-02-28 00:47:49.858986,0 days 00:00:01.087217,1.0,,,0.169675,2,800,2,,COMPLETE
3,3,0.654467,2021-02-28 00:47:49.859580,2021-02-28 00:47:51.066421,0 days 00:00:01.206841,0.8,10.0,0.8,0.026927,97,500,8,,COMPLETE
4,4,0.699566,2021-02-28 00:47:51.067120,2021-02-28 00:48:09.422505,0 days 00:00:18.355385,1.0,,,0.022071,2,800,41,,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
341,341,0.729071,2021-02-28 00:58:05.393539,2021-02-28 00:58:07.247810,0 days 00:00:01.854271,1.0,,,0.041963,2,300,4,,COMPLETE
342,342,0.720445,2021-02-28 00:58:07.249320,2021-02-28 00:58:09.656463,0 days 00:00:02.407143,1.0,,,0.045403,2,300,7,,COMPLETE
343,343,0.745344,2021-02-28 00:58:09.657982,2021-02-28 00:58:12.227924,0 days 00:00:02.569942,1.0,,,0.051121,2,300,5,,COMPLETE
344,344,0.729556,2021-02-28 00:58:12.229945,2021-02-28 00:58:17.761292,0 days 00:00:05.531347,1.0,,,0.055252,2,300,5,,COMPLETE


In [27]:
df.state.value_counts()

COMPLETE    345
PRUNED        1
Name: state, dtype: int64

In [28]:
study.best_params

{'lgbm_min_child_samples': 2,
 'lgbm_bagging_fraction': 1.0,
 'lgbm_learning_rate': 0.04348842010570929,
 'lgbm_num_iterations': 300,
 'lgbm_num_leaves': 5}

In [29]:
model = LightGBMClassifier(random_state=RANDOM_SEED)
model.model_param

{'random_seed': 42,
 'num_iterations': 300,
 'verbose': -1,
 'device_type': 'cpu',
 'objective': 'binary'}

In [30]:
model = objective(optuna.trial.FixedTrial(study.best_params), 
                                    model,
                                    folds, 
                                    score_folds, 
                                    opt_lvl, 
                                    return_model=True,)

In [31]:
model.model_param

{'random_seed': 42,
 'num_iterations': 300,
 'verbose': -1,
 'device_type': 'cpu',
 'objective': 'binary',
 'min_child_samples': 2,
 'bagging_fraction': 1.0,
 'learning_rate': 0.04348842010570929,
 'num_leaves': 5}

In [33]:
cv = CrossValidation(
        estimator=model,
        folds=folds,
        score_folds=score_folds,
        n_repeats=1,
        metric=sklearn.metrics.roc_auc_score,
        print_metric=False, 
        metric_round=4, 
        random_state=RANDOM_SEED
        )

In [34]:
cv.fit(X_train, y_train)

Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations


In [35]:
predicts = cv.predict_test(X_test)
print('Test AUC: ', round(sklearn.metrics.roc_auc_score(y_test, predicts),4))

Test AUC:  0.8283


In [36]:
model = LightGBMClassifier(random_state=RANDOM_SEED)

In [37]:
cv = CrossValidation(
        estimator=model,
        folds=folds,
        score_folds=folds,
        n_repeats=1,
        metric=sklearn.metrics.roc_auc_score,
        print_metric=False, 
        metric_round=4, 
        random_state=RANDOM_SEED
        )
cv.fit(X_train, y_train)

Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations


In [38]:
predicts = cv.predict_test(X_test)
print('Test AUC: ', round(sklearn.metrics.roc_auc_score(y_test, predicts),4))

Test AUC:  0.8161


In [7]:
from automl_alex.optimizer import *

In [8]:
opt = Optimizer(model)

In [9]:
model = opt.opt(X_train, y_train, timeout=300, verbose=3)

63870143504, 'lgbm_num_leaves': 9, 'lgbm_num_iterations': 400, 'lgbm_bagging_fraction': 0.8, 'lgbm_feature_fraction': 0.6000000000000001, 'lgbm_bagging_freq': 9}. Best is trial 117 with value: 0.74555.
[I 2021-02-28 02:15:37,572] Trial 134 finished with value: 0.72 and parameters: {'lgbm_min_child_samples': 2, 'lgbm_learning_rate': 0.03533649508899552, 'lgbm_num_leaves': 4, 'lgbm_num_iterations': 500, 'lgbm_bagging_fraction': 1.0}. Best is trial 117 with value: 0.74555.
[I 2021-02-28 02:15:38,113] Trial 135 finished with value: 0.6972499999999999 and parameters: {'lgbm_min_child_samples': 8, 'lgbm_learning_rate': 0.05647341589411643, 'lgbm_num_leaves': 7, 'lgbm_num_iterations': 300, 'lgbm_bagging_fraction': 0.8, 'lgbm_feature_fraction': 0.7000000000000001, 'lgbm_bagging_freq': 8}. Best is trial 117 with value: 0.74555.
[I 2021-02-28 02:15:39,271] Trial 136 finished with value: 0.7082499999999999 and parameters: {'lgbm_min_child_samples': 2, 'lgbm_learning_rate': 0.049274134301148874, '

In [42]:
model = model.fit(X_train, y_train)
predicts = model.predict_proba(X_test)
print('Test AUC: ', round(sklearn.metrics.roc_auc_score(y_test, predicts),4))

Test AUC:  0.8277


In [43]:
cv = CrossValidation(
        estimator=model,
        folds=opt.folds,
        score_folds=folds,
        n_repeats=1,
        print_metric=False, 
        metric_round=4, 
        random_state=RANDOM_SEED
        )
cv.fit(X_train, y_train)

Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations


In [44]:
predicts = cv.predict_test(X_test)
print('Test AUC: ', round(sklearn.metrics.roc_auc_score(y_test, predicts),4))

Test AUC:  0.827


In [49]:
predicts = model.predict_proba(X_test)
print('Test AUC: ', round(sklearn.metrics.roc_auc_score(y_test, predicts),4))

Test AUC:  0.8151


In [50]:
# or just use .score()
score = model.score(
    X_test, y_test,
    metric=sklearn.metrics.roc_auc_score,
    metric_round=4,
    print_metric=True,
    )

23:24:29 | roc_auc_score: 0.8151


In [51]:
from automl_alex import CrossValidation

In [52]:
cv = CrossValidation(
        estimator=model,
        folds=10,
        score_folds=5,
        n_repeats=1,
        metric=sklearn.metrics.roc_auc_score,
        print_metric=False, 
        metric_round=4, 
        random_state=RANDOM_SEED
        )

In [53]:
score, score_std = cv.fit_score(X_train, y_train, print_metric=False)

In [54]:
start_score = score - score_std

In [55]:
model.model_param

{'random_seed': 42,
 'num_iterations': 300,
 'verbose': -1,
 'device_type': 'cpu',
 'objective': 'binary'}

In [17]:
def objective(trial, model):
    model.model_param = model.get_model_opt_params(
            trial=trial, 
            opt_lvl=4,
            )
    cv = CrossValidation(
        estimator=model,
        folds=10,
        score_folds=3,
        n_repeats=1,
        metric=sklearn.metrics.roc_auc_score,
        print_metric=False, 
        metric_round=4, 
        random_state=RANDOM_SEED
        )
    score, score_std = cv.fit_score(X_train, y_train, print_metric=False, trial=trial)
    if cv._pruned_cv:
        raise optuna.TrialPruned()
    return(score)

In [15]:
def objective(trial):
    model = LightGBMClassifier(random_state=RANDOM_SEED)
    model.model_param = model.get_model_opt_params(
            trial=trial, 
            opt_lvl=4,
            )
    model = model.fit(X_train, y_train)
    score = model.score(
    X_test, y_test,
    metric=sklearn.metrics.roc_auc_score,
    metric_round=4,
    print_metric=False,
    )
    return(score)

In [18]:
optuna.logging.disable_default_handler()

In [19]:
model = LightGBMClassifier(random_state=RANDOM_SEED)

In [20]:
start_score

0.722

In [21]:
study = optuna.create_study(
    direction='maximize',
    pruner=optuna.pruners.ThresholdPruner(lower=start_score)
    )

In [22]:
dafault_params = model.get_model_start_opt_params()

In [23]:
study.enqueue_trial(dafault_params)

In [24]:

study.optimize(lambda trial: objective(trial, model), timeout=500, show_progress_bar=False)

04:05:33 | Pruned on 0 fold
04:05:34 | Pruned on 0 fold
04:05:34 | Pruned on 0 fold
04:05:35 | Pruned on 0 fold
04:05:35 | Pruned on 0 fold
04:05:35 | Pruned on 0 fold
04:05:36 | Pruned on 0 fold
04:05:37 | Pruned on 0 fold
04:05:49 | Pruned on 0 fold
04:07:04 | Pruned on 0 fold
04:07:05 | Pruned on 0 fold
04:07:09 | Pruned on 0 fold
04:07:14 | Pruned on 0 fold
04:07:16 | Pruned on 0 fold
04:07:21 | Pruned on 0 fold
04:07:52 | Pruned on 0 fold
04:07:54 | Pruned on 0 fold
04:07:54 | Pruned on 0 fold
04:08:01 | Pruned on 0 fold
04:08:09 | Pruned on 0 fold
04:08:14 | Pruned on 0 fold
04:08:41 | Pruned on 0 fold
04:09:18 | Pruned on 0 fold
04:09:24 | Pruned on 0 fold
04:09:25 | Pruned on 0 fold
04:09:27 | Pruned on 0 fold
04:09:40 | Pruned on 0 fold
04:09:54 | Pruned on 0 fold
04:10:16 | Pruned on 0 fold
04:10:24 | Pruned on 0 fold
04:10:42 | Pruned on 0 fold
04:11:00 | Pruned on 0 fold
04:12:02 | Pruned on 0 fold
04:12:44 | Pruned on 0 fold
04:12:44 | Pruned on 0 fold
04:13:41 | Pruned on

In [25]:
study.best_params 

{'lgbm_min_child_samples': 3,
 'lgbm_bagging_fraction': 0.9,
 'lgbm_feature_fraction': 0.9,
 'lgbm_bagging_freq': 5,
 'lgbm_num_leaves': 14,
 'lgbm_learning_rate': 0.008293990812611252,
 'lgbm_boosting': 'gbdt',
 'lgbm_num_iterations': 600,
 'lgbm_objective': 'binary'}

In [26]:
optuna.visualization.plot_optimization_history(study)

In [27]:
optuna.visualization.plot_param_importances(study)

In [33]:
optuna.visualization.plot_slice(study)

In [37]:
df = study.trials_dataframe()

In [39]:
df

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_lgbm_bagging_fraction,params_lgbm_bagging_freq,params_lgbm_boosting,params_lgbm_drop_rate,params_lgbm_feature_fraction,...,params_lgbm_max_drop,params_lgbm_min_child_samples,params_lgbm_num_iterations,params_lgbm_num_leaves,params_lgbm_objective,params_lgbm_skip_drop,params_lgbm_uniform_drop,params_lgbm_xgboost_dart_mode,system_attrs_fixed_params,state
0,0,0.7084,2021-02-27 04:05:31.104612,2021-02-27 04:05:33.155673,0 days 00:00:02.051061,0.4,5.0,gbdt,,0.4,...,,20,300,31,cross_entropy,,,,"{'lgbm_num_leaves': 31, 'lgbm_min_child_sample...",PRUNED
1,1,0.6895,2021-02-27 04:05:33.167604,2021-02-27 04:05:34.468074,0 days 00:00:01.300470,0.7,3.0,dart,1.470119e-01,0.6,...,49.0,2,700,3,binary,0.061949,True,True,,PRUNED
2,2,0.6844,2021-02-27 04:05:34.469271,2021-02-27 04:05:34.708783,0 days 00:00:00.239512,0.5,6.0,dart,1.170914e-05,0.9,...,9.0,34,400,2,cross_entropy,0.266699,False,True,,PRUNED
3,3,0.6545,2021-02-27 04:05:34.708890,2021-02-27 04:05:35.123811,0 days 00:00:00.414921,0.5,11.0,dart,2.239867e-07,0.5,...,60.0,80,1100,2,cross_entropy,0.048539,False,True,,PRUNED
4,4,0.6902,2021-02-27 04:05:35.123929,2021-02-27 04:05:35.388900,0 days 00:00:00.264971,1.0,,gbdt,,,...,,5,700,2,cross_entropy,,,,,PRUNED
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,96,0.7540,2021-02-27 04:13:44.815034,2021-02-27 04:13:46.600442,0 days 00:00:01.785408,0.7,7.0,gbdt,,1.0,...,,2,200,25,binary,,,,,COMPLETE
97,97,0.7668,2021-02-27 04:13:46.601343,2021-02-27 04:13:48.193296,0 days 00:00:01.591953,0.9,8.0,gbdt,,0.9,...,,2,300,15,binary,,,,,COMPLETE
98,98,0.7625,2021-02-27 04:13:48.194160,2021-02-27 04:13:50.526372,0 days 00:00:02.332212,0.9,5.0,gbdt,,0.9,...,,2,500,12,binary,,,,,COMPLETE
99,99,0.7215,2021-02-27 04:13:50.527248,2021-02-27 04:13:52.695120,0 days 00:00:02.167872,0.7,6.0,gbdt,,1.0,...,,2,1000,18,binary,,,,,PRUNED


In [38]:
df.nunique()

TypeError: unhashable type: 'dict'

In [36]:
df.state.value_counts()

COMPLETE    61
PRUNED      40
Name: state, dtype: int64

In [None]:
df