# Setup

In [None]:
from google.colab import files
from google.colab import drive
# Mount the user's Google Drive inside the Colab VM at /content/drive.
# NOTE(review): no cell visible in this notebook reads from or writes to /content/drive
# (datasets are loaded from the relative 'data/' folder) — confirm the mount is still needed.
drive.mount('/content/drive')

In [None]:
# SECURITY: a GitHub personal access token used to be hard-coded on this line.
# A token that has ever been saved in a notebook must be treated as leaked and revoked.
# It is now requested interactively, so it is never stored in the file or its outputs.
from getpass import getpass

token = getpass('GitHub personal access token: ')
usr = '243046'  # account segment of the clone URL (github.com/{usr}/{repo})
repo = 'boost'  # repository segment of the clone URL

In [None]:
# Clone the repository over HTTPS; IPython substitutes {token}, {usr} and {repo} before running git.
# NOTE(review): the token is part of the clone URL, so it presumably ends up in the clone's
# remote configuration and may be echoed in the cell output — clear outputs before sharing.
!git clone https://{token}@github.com/{usr}/{repo}

In [None]:
# Enter the cloned checkout. `{repo}` is expanded by IPython, so the directory always
# matches the repository name used for the clone instead of a second hard-coded copy.
%cd {repo}

In [None]:
# Install the project's dependencies. `%pip` (rather than `!pip`) guarantees the packages
# are installed into the environment of the running kernel.
%pip install -r requirements.txt

# No search - all datasets

In [None]:
import warnings

import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from wrappers.datasets_models_wrappers import DataModelsWrapper, DataModelsWrapperRandomSearch
from wrappers.datasets_models_wrappers_nlp import DataModelsWrapperNLP, DataModelsWrapperNLPRandomSearch
from data_processing.process_dataset import prepare_datasets_for_classification
from data_processing.process_dataset_nlp import prepare_nlp_for_classification

# Suppress every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation and convergence warnings —
# consider narrowing the filter.
warnings.filterwarnings('ignore')


def run(param_dict, mode='randomized', tuner='hyperopt', scoring='accuracy'):
    """Fit the benchmark wrapper selected by ``mode`` and return its result tables.

    NOTE(review): ``run`` is redefined in several sections of this notebook (the TPE
    sections default to ``mode='TPE'``); whichever cell was executed last wins.

    Parameters
    ----------
    param_dict : dict
        Passed unchanged to the wrapper. In this notebook it maps a dataset label to an
        ``(X, y, models)`` tuple.
    mode : {'randomized', 'TPE'}
        ``'randomized'`` builds a ``DataModelsWrapperRandomSearch``; ``'TPE'`` builds a
        ``DataModelsWrapper`` configured with ``tuner``.
    tuner : str
        Forwarded to ``DataModelsWrapper``; unused when ``mode='randomized'``.
    scoring : str
        Forwarded to the selected wrapper.

    Returns
    -------
    tuple
        ``(all_datasets_results_, all_datasets_runtimes_, results_for_plotting_,
        runtimes_for_plotting_)`` read from the wrapper after ``fit()``.

    Raises
    ------
    ValueError
        If ``mode`` is not supported. Previously an unknown mode skipped both branches
        and failed later with an ``UnboundLocalError`` on ``model``.
    """
    if mode == 'randomized':
        model = DataModelsWrapperRandomSearch(param_dict, scoring=scoring)
    elif mode == 'TPE':
        model = DataModelsWrapper(param_dict, tuner=tuner, scoring=scoring)
    else:
        raise ValueError(f"mode must be 'randomized' or 'TPE', got {mode!r}")
    model.fit()
    return (model.all_datasets_results_, model.all_datasets_runtimes_,
            model.results_for_plotting_, model.runtimes_for_plotting_)


def run_nlp(param_dict, mode='randomized', tuner='hyperopt', scoring='accuracy',
            tfidf_kws=None):
    """Fit the NLP benchmark wrapper selected by ``mode`` and return its result tables.

    Parameters
    ----------
    param_dict : dict
        Passed unchanged to the wrapper. In this notebook it maps a dataset label to an
        ``(X, y, models)`` tuple.
    mode : {'randomized', 'TPE'}
        ``'randomized'`` builds a ``DataModelsWrapperNLPRandomSearch``; ``'TPE'`` builds a
        ``DataModelsWrapperNLP`` configured with ``tuner``.
    tuner : str
        Forwarded to ``DataModelsWrapperNLP``; unused when ``mode='randomized'``.
    scoring : str
        Forwarded to the selected wrapper.
    tfidf_kws : dict, optional
        Forwarded to the wrapper as ``tfidf_kws``. ``None`` (the default) selects
        ``{'ngram_range': (1, 2), 'min_df': 3, 'max_features': 10000}``.

    Returns
    -------
    tuple
        ``(all_datasets_results_, all_datasets_runtimes_, results_for_plotting_,
        runtimes_for_plotting_)`` read from the wrapper after ``fit()``.

    Raises
    ------
    ValueError
        If ``mode`` is not supported (previously an ``UnboundLocalError``).
    """
    if tfidf_kws is None:
        # Built per call: a dict literal in the signature would be one object shared by
        # every call, so a mutation made downstream would leak into later runs.
        tfidf_kws = {'ngram_range': (1, 2), 'min_df': 3, 'max_features': 10000}
    if mode == 'randomized':
        model = DataModelsWrapperNLPRandomSearch(param_dict, scoring=scoring, tfidf_kws=tfidf_kws)
    elif mode == 'TPE':
        model = DataModelsWrapperNLP(param_dict, tuner=tuner, scoring=scoring, tfidf_kws=tfidf_kws)
    else:
        raise ValueError(f"mode must be 'randomized' or 'TPE', got {mode!r}")
    model.fit()
    return (model.all_datasets_results_, model.all_datasets_runtimes_,
            model.results_for_plotting_, model.runtimes_for_plotting_)

In [None]:
# File name -> settings tuple handed to prepare_datasets_for_classification.
# NOTE(review): the tuple layout is not visible in this notebook; it looks like
# (target column, column selection, row limit) — confirm against the helper.
d = dict([
    ('mushrooms.csv', ('class', 'all', None)),
    ('adult.csv', ('profit', [], None)),
    ('churn.csv', ('Churn', [], None)),
    ('creditcard.csv', ('Class', [], None)),
    ('prostate.csv', ('target', [], None)),
    ('leukemia.csv', ('target', [], None)),
    ('weather_dataset.csv', ('target', [], 200)),
])

# One (X, y) pair per file; presumably returned in declaration order — TODO confirm.
(X_1, y_1, X_2, y_2, X_3, y_3, X_4, y_4,
 X_5, y_5, X_6, y_6, X_7, y_7) = prepare_datasets_for_classification(d, data_path='data/')

# The text dataset goes through the separate NLP loading helper.
imdb_load_kws = dict(
    dataset_name='imdb_dataset.csv',
    text_column='review_cleared',
    y_col='sentiment',
    nrows=2000,
    data_path='data/',
)
X_8, y_8 = prepare_nlp_for_classification(**imdb_load_kws)

# Every estimator is paired with an empty search space in this section.
models = {}
models['Gradient Boosting'] = (GradientBoostingClassifier(), {})
models['XGBoost'] = (
    XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=123),
    {},
)
models['LightGBM'] = (LGBMClassifier(), {})
models['CatBoost'] = (
    CatBoostClassifier(n_estimators=100, verbose=False, random_state=123),
    {},
)

# Dataset label -> (features, target, models); the same `models` object is shared by every entry.
tabular_sets = (
    ('mushrooms', X_1, y_1),
    ('adult', X_2, y_2),
    ('churn', X_3, y_3),
    ('credit card', X_4, y_4),
    ('prostate', X_5, y_5),
    ('leukemia', X_6, y_6),
    ('weather', X_7, y_7),
)
param_dict = {label: (X, y, models) for label, X, y in tabular_sets}

param_dict_nlp = {'IMDB reviews': (X_8, y_8, models)}

# Forwarded as `tfidf_kws` to the NLP wrapper.
tfidf_kws = dict(ngram_range=(1, 2), min_df=3, max_features=10000)

# Tabular datasets.
tabular_outputs = run(param_dict=param_dict, mode='randomized', scoring='accuracy')
all_results, all_runtimes, results_for_plotting, runtimes_for_plotting = tabular_outputs

# Text dataset: only the two plotting frames are kept.
nlp_outputs = run_nlp(
    param_dict=param_dict_nlp,
    mode='randomized',
    scoring='accuracy',
    tfidf_kws=tfidf_kws,
)
_, _, results_for_plotting_nlp, runtimes_for_plotting_nlp = nlp_outputs

# all_results / all_runtimes returned by run() are replaced here by the combined plotting frames.
all_results = pd.concat([results_for_plotting, results_for_plotting_nlp])
all_runtimes = pd.concat([runtimes_for_plotting, runtimes_for_plotting_nlp])

results_xlsx = 'results_colab/results_no_search.xlsx'
runtimes_xlsx = 'results_colab/runtimes_no_search.xlsx'
all_results.to_excel(results_xlsx, index=False)
all_runtimes.to_excel(runtimes_xlsx, index=False)

In [None]:
# Hand both workbooks written by the previous cell to google.colab's download helper.
for xlsx_path in ('results_colab/results_no_search.xlsx',
                  'results_colab/runtimes_no_search.xlsx'):
    files.download(xlsx_path)

# Ordinary TPE

In [None]:
import warnings

from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from ray import tune

from wrappers.datasets_models_wrappers import DataModelsWrapper, DataModelsWrapperRandomSearch
from data_processing.process_dataset import prepare_datasets_for_classification
# Suppress every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation and convergence warnings —
# consider narrowing the filter.
warnings.filterwarnings('ignore')


def run(param_dict, mode='TPE', tuner='hyperopt', scoring='accuracy'):
    """Fit the benchmark wrapper selected by ``mode`` and return its result tables.

    NOTE(review): ``run`` is redefined in several sections of this notebook with
    different default ``mode`` values; whichever cell was executed last wins.

    Parameters
    ----------
    param_dict : dict
        Passed unchanged to the wrapper. In this notebook it maps a dataset label to an
        ``(X, y, models)`` tuple.
    mode : {'randomized', 'TPE'}
        ``'randomized'`` builds a ``DataModelsWrapperRandomSearch``; ``'TPE'`` (default
        here) builds a ``DataModelsWrapper`` configured with ``tuner``.
    tuner : str
        Forwarded to ``DataModelsWrapper``; unused when ``mode='randomized'``.
    scoring : str
        Forwarded to the selected wrapper.

    Returns
    -------
    tuple
        ``(all_datasets_results_, all_datasets_runtimes_, results_for_plotting_,
        runtimes_for_plotting_)`` read from the wrapper after ``fit()``.

    Raises
    ------
    ValueError
        If ``mode`` is not supported. Previously an unknown mode skipped both branches
        and failed later with an ``UnboundLocalError`` on ``model``.
    """
    if mode == 'randomized':
        model = DataModelsWrapperRandomSearch(param_dict, scoring=scoring)
    elif mode == 'TPE':
        model = DataModelsWrapper(param_dict, tuner=tuner, scoring=scoring)
    else:
        raise ValueError(f"mode must be 'randomized' or 'TPE', got {mode!r}")
    model.fit()
    return (model.all_datasets_results_, model.all_datasets_runtimes_,
            model.results_for_plotting_, model.runtimes_for_plotting_)

In [None]:
# File name -> settings tuple handed to prepare_datasets_for_classification.
# NOTE(review): the tuple layout is not visible in this notebook; it looks like
# (target column, column selection, row limit) — confirm against the helper.
d = dict([
    ('mushrooms.csv', ('class', 'all', None)),
    ('adult.csv', ('profit', [], None)),
    ('churn.csv', ('Churn', [], None)),
    ('creditcard.csv', ('Class', [], None)),
])

# One (X, y) pair per file; presumably returned in declaration order — TODO confirm.
(X_1, y_1,
 X_2, y_2,
 X_3, y_3,
 X_4, y_4) = prepare_datasets_for_classification(d, data_path='data/')


def _ordinary_tpe_space():
    """Return a fresh Ray Tune space over ``n_estimators`` and ``learning_rate``."""
    return {
        'n_estimators': tune.choice([50, 100, 150]),
        'learning_rate': tune.loguniform(0.01, 0.1),
    }


# All four libraries are tuned over the same two hyper-parameters; each gets its own dict.
boosting_params = _ordinary_tpe_space()
xgb_params = _ordinary_tpe_space()
lgbm_params = _ordinary_tpe_space()
catboost_params = _ordinary_tpe_space()

# Display name -> (estimator, search space).
models = {}
models['Gradient Boosting'] = (GradientBoostingClassifier(), boosting_params)
models['XGBoost'] = (
    XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=123),
    xgb_params,
)
models['LightGBM'] = (LGBMClassifier(), lgbm_params)
models['CatBoost'] = (CatBoostClassifier(verbose=False, random_state=123), catboost_params)

# Dataset label -> (features, target, models); the same `models` object is shared by every entry.
tabular_sets = (
    ('mushrooms', X_1, y_1),
    ('adult', X_2, y_2),
    ('churn', X_3, y_3),
    ('credit card', X_4, y_4),
)
param_dict = {label: (X, y, models) for label, X, y in tabular_sets}

# Run the TPE search on every dataset and unpack the four result tables.
tpe_outputs = run(param_dict=param_dict, mode='TPE', scoring='accuracy')
all_results, all_runtimes, results_for_plotting, runtimes_for_plotting = tpe_outputs

# `name` is also read by the download cell that follows.
name = 'ordinary_TPE'
results_xlsx = f'results_colab/results_{name}.xlsx'
runtimes_xlsx = f'results_colab/runtimes_{name}.xlsx'
results_for_plotting.to_excel(results_xlsx, index=False)
runtimes_for_plotting.to_excel(runtimes_xlsx, index=False)

In [None]:
# Hand both workbooks for the current `name` to google.colab's download helper.
for xlsx_path in (f'results_colab/results_{name}.xlsx',
                  f'results_colab/runtimes_{name}.xlsx'):
    files.download(xlsx_path)

# High dimensional TPE

In [None]:
import warnings

import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from ray import tune

from wrappers.datasets_models_wrappers import DataModelsWrapper, DataModelsWrapperRandomSearch
from wrappers.datasets_models_wrappers_nlp import DataModelsWrapperNLP, DataModelsWrapperNLPRandomSearch
from data_processing.process_dataset import prepare_datasets_for_classification
from data_processing.process_dataset_nlp import prepare_nlp_for_classification
# Suppress every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation and convergence warnings —
# consider narrowing the filter.
warnings.filterwarnings('ignore')


def run(param_dict, mode='TPE', tuner='hyperopt', scoring='accuracy'):
    """Fit the benchmark wrapper selected by ``mode`` and return its result tables.

    NOTE(review): ``run`` is redefined in several sections of this notebook with
    different default ``mode`` values; whichever cell was executed last wins.

    Parameters
    ----------
    param_dict : dict
        Passed unchanged to the wrapper. In this notebook it maps a dataset label to an
        ``(X, y, models)`` tuple.
    mode : {'randomized', 'TPE'}
        ``'randomized'`` builds a ``DataModelsWrapperRandomSearch``; ``'TPE'`` (default
        here) builds a ``DataModelsWrapper`` configured with ``tuner``.
    tuner : str
        Forwarded to ``DataModelsWrapper``; unused when ``mode='randomized'``.
    scoring : str
        Forwarded to the selected wrapper.

    Returns
    -------
    tuple
        ``(all_datasets_results_, all_datasets_runtimes_, results_for_plotting_,
        runtimes_for_plotting_)`` read from the wrapper after ``fit()``.

    Raises
    ------
    ValueError
        If ``mode`` is not supported. Previously an unknown mode skipped both branches
        and failed later with an ``UnboundLocalError`` on ``model``.
    """
    if mode == 'randomized':
        model = DataModelsWrapperRandomSearch(param_dict, scoring=scoring)
    elif mode == 'TPE':
        model = DataModelsWrapper(param_dict, tuner=tuner, scoring=scoring)
    else:
        raise ValueError(f"mode must be 'randomized' or 'TPE', got {mode!r}")
    model.fit()
    return (model.all_datasets_results_, model.all_datasets_runtimes_,
            model.results_for_plotting_, model.runtimes_for_plotting_)


def run_nlp(param_dict, mode='TPE', tuner='hyperopt', scoring='accuracy',
            tfidf_kws=None):
    """Fit the NLP benchmark wrapper selected by ``mode`` and return its result tables.

    Parameters
    ----------
    param_dict : dict
        Passed unchanged to the wrapper. In this notebook it maps a dataset label to an
        ``(X, y, models)`` tuple.
    mode : {'randomized', 'TPE'}
        ``'randomized'`` builds a ``DataModelsWrapperNLPRandomSearch``; ``'TPE'`` (default
        here) builds a ``DataModelsWrapperNLP`` configured with ``tuner``.
    tuner : str
        Forwarded to ``DataModelsWrapperNLP``; unused when ``mode='randomized'``.
    scoring : str
        Forwarded to the selected wrapper.
    tfidf_kws : dict, optional
        Forwarded to the wrapper as ``tfidf_kws``. ``None`` (the default) selects
        ``{'ngram_range': (1, 2), 'min_df': 3, 'max_features': 10000}``.

    Returns
    -------
    tuple
        ``(all_datasets_results_, all_datasets_runtimes_, results_for_plotting_,
        runtimes_for_plotting_)`` read from the wrapper after ``fit()``.

    Raises
    ------
    ValueError
        If ``mode`` is not supported (previously an ``UnboundLocalError``).
    """
    if tfidf_kws is None:
        # Built per call: a dict literal in the signature would be one object shared by
        # every call, so a mutation made downstream would leak into later runs.
        tfidf_kws = {'ngram_range': (1, 2), 'min_df': 3, 'max_features': 10000}
    if mode == 'randomized':
        model = DataModelsWrapperNLPRandomSearch(param_dict, scoring=scoring, tfidf_kws=tfidf_kws)
    elif mode == 'TPE':
        model = DataModelsWrapperNLP(param_dict, tuner=tuner, scoring=scoring, tfidf_kws=tfidf_kws)
    else:
        raise ValueError(f"mode must be 'randomized' or 'TPE', got {mode!r}")
    model.fit()
    return (model.all_datasets_results_, model.all_datasets_runtimes_,
            model.results_for_plotting_, model.runtimes_for_plotting_)

In [None]:
# File name -> settings tuple handed to prepare_datasets_for_classification.
# NOTE(review): the tuple layout is not visible in this notebook; it looks like
# (target column, column selection, row limit) — confirm against the helper.
d = dict([
    ('prostate.csv', ('target', [], None)),
    ('leukemia.csv', ('target', [], None)),
    ('weather_dataset.csv', ('target', [], 500)),
])

# One (X, y) pair per file; presumably returned in declaration order — TODO confirm.
(X_1, y_1,
 X_2, y_2,
 X_3, y_3) = prepare_datasets_for_classification(d, data_path='data/')

# The text dataset goes through the separate NLP loading helper.
imdb_load_kws = dict(
    dataset_name='imdb_dataset.csv',
    text_column='review_cleared',
    y_col='sentiment',
    nrows=3000,
    data_path='data/',
)
X_4, y_4 = prepare_nlp_for_classification(**imdb_load_kws)

# Ray Tune search spaces, one per library.
boosting_params = dict(subsample=tune.uniform(0.6, 0.8))
xgb_params = dict(
    reg_alpha=tune.loguniform(1, 10),
    reg_lambda=tune.loguniform(1, 10),
    gamma=tune.uniform(0.5, 2),
)
lgbm_params = dict(
    reg_alpha=tune.loguniform(1, 10),
    reg_lambda=tune.loguniform(1, 10),
)
catboost_params = dict(reg_lambda=tune.loguniform(3, 10))

# Display name -> (estimator, search space). Each estimator is configured with a
# feature-subsampling option (max_features / colsample_*).
models = {}
models['Gradient Boosting'] = (GradientBoostingClassifier(max_features='sqrt'), boosting_params)
models['XGBoost'] = (
    XGBClassifier(colsample_bynode=0.5, use_label_encoder=False,
                  eval_metric='logloss', random_state=123),
    xgb_params,
)
models['LightGBM'] = (LGBMClassifier(colsample_bynode=0.5), lgbm_params)
models['CatBoost'] = (
    CatBoostClassifier(colsample_bylevel=0.5, n_estimators=100,
                       verbose=False, random_state=123),
    catboost_params,
)

# Dataset label -> (features, target, models); the same `models` object is shared by every entry.
tabular_sets = (
    ('prostate', X_1, y_1),
    ('leukemia', X_2, y_2),
    ('weather', X_3, y_3),
)
param_dict = {label: (X, y, models) for label, X, y in tabular_sets}

param_dict_nlp = {'IMDB reviews': (X_4, y_4, models)}

# Forwarded as `tfidf_kws` to the NLP wrapper.
tfidf_kws = dict(ngram_range=(1, 2), min_df=3, max_features=10000)

# Tabular datasets.
tabular_outputs = run(param_dict=param_dict, mode='TPE', scoring='accuracy')
all_results, all_runtimes, results_for_plotting, runtimes_for_plotting = tabular_outputs

# Text dataset: only the two plotting frames are kept.
nlp_outputs = run_nlp(
    param_dict=param_dict_nlp,
    mode='TPE',
    scoring='accuracy',
    tfidf_kws=tfidf_kws,
)
_, _, results_for_plotting_nlp, runtimes_for_plotting_nlp = nlp_outputs

# `name` is also read by the download cell that follows.
name = 'high_dimensional_TPE'
# all_results / all_runtimes returned by run() are replaced here by the combined plotting frames.
all_results = pd.concat([results_for_plotting, results_for_plotting_nlp])
all_runtimes = pd.concat([runtimes_for_plotting, runtimes_for_plotting_nlp])

results_xlsx = f'results_colab/results_{name}.xlsx'
runtimes_xlsx = f'results_colab/runtimes_{name}.xlsx'
all_results.to_excel(results_xlsx, index=False)
all_runtimes.to_excel(runtimes_xlsx, index=False)

In [None]:
# Hand both workbooks for the current `name` to google.colab's download helper.
for xlsx_path in (f'results_colab/results_{name}.xlsx',
                  f'results_colab/runtimes_{name}.xlsx'):
    files.download(xlsx_path)

# Ordinary

In [None]:
import warnings

from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from scipy import stats

from wrappers.datasets_models_wrappers import DataModelsWrapper, DataModelsWrapperRandomSearch
from data_processing.process_dataset import prepare_datasets_for_classification
# Suppress every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation and convergence warnings —
# consider narrowing the filter.
warnings.filterwarnings('ignore')


def run(param_dict, mode='randomized', tuner='hyperopt', scoring='accuracy'):
    """Fit the benchmark wrapper selected by ``mode`` and return its result tables.

    NOTE(review): ``run`` is redefined in several sections of this notebook with
    different default ``mode`` values; whichever cell was executed last wins.

    Parameters
    ----------
    param_dict : dict
        Passed unchanged to the wrapper. In this notebook it maps a dataset label to an
        ``(X, y, models)`` tuple.
    mode : {'randomized', 'TPE'}
        ``'randomized'`` (default here) builds a ``DataModelsWrapperRandomSearch``;
        ``'TPE'`` builds a ``DataModelsWrapper`` configured with ``tuner``.
    tuner : str
        Forwarded to ``DataModelsWrapper``; unused when ``mode='randomized'``.
    scoring : str
        Forwarded to the selected wrapper.

    Returns
    -------
    tuple
        ``(all_datasets_results_, all_datasets_runtimes_, results_for_plotting_,
        runtimes_for_plotting_)`` read from the wrapper after ``fit()``.

    Raises
    ------
    ValueError
        If ``mode`` is not supported. Previously an unknown mode skipped both branches
        and failed later with an ``UnboundLocalError`` on ``model``.
    """
    if mode == 'randomized':
        model = DataModelsWrapperRandomSearch(param_dict, scoring=scoring)
    elif mode == 'TPE':
        model = DataModelsWrapper(param_dict, tuner=tuner, scoring=scoring)
    else:
        raise ValueError(f"mode must be 'randomized' or 'TPE', got {mode!r}")
    model.fit()
    return (model.all_datasets_results_, model.all_datasets_runtimes_,
            model.results_for_plotting_, model.runtimes_for_plotting_)

In [None]:
# File name -> settings tuple handed to prepare_datasets_for_classification.
# NOTE(review): the tuple layout is not visible in this notebook; it looks like
# (target column, column selection, row limit) — confirm against the helper.
d = dict([
    ('mushrooms.csv', ('class', 'all', None)),
    ('adult.csv', ('profit', [], None)),
    ('churn.csv', ('Churn', [], None)),
    ('creditcard.csv', ('Class', [], None)),
])

# One (X, y) pair per file; presumably returned in declaration order — TODO confirm.
(X_1, y_1,
 X_2, y_2,
 X_3, y_3,
 X_4, y_4) = prepare_datasets_for_classification(d, data_path='data/')


def _ordinary_random_space():
    """Return a fresh random-search space over ``n_estimators`` and ``learning_rate``."""
    return {
        'n_estimators': [50, 100, 150],
        'learning_rate': stats.loguniform(0.01, 0.1),
    }


# All four libraries are searched over the same two hyper-parameters; each gets its own dict.
boosting_params = _ordinary_random_space()
xgb_params = _ordinary_random_space()
lgbm_params = _ordinary_random_space()
catboost_params = _ordinary_random_space()

# Display name -> (estimator, search space).
models = {}
models['Gradient Boosting'] = (GradientBoostingClassifier(), boosting_params)
models['XGBoost'] = (
    XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=123),
    xgb_params,
)
models['LightGBM'] = (LGBMClassifier(), lgbm_params)
models['CatBoost'] = (CatBoostClassifier(verbose=False, random_state=123), catboost_params)

# Dataset label -> (features, target, models); the same `models` object is shared by every entry.
tabular_sets = (
    ('mushrooms', X_1, y_1),
    ('adult', X_2, y_2),
    ('churn', X_3, y_3),
    ('credit card', X_4, y_4),
)
param_dict = {label: (X, y, models) for label, X, y in tabular_sets}

# Run the randomized search on every dataset and unpack the four result tables.
all_results, all_runtimes, results_for_plotting, runtimes_for_plotting = run(param_dict=param_dict,
                                                                             mode='randomized', scoring='accuracy')

# `name` is also read by the download cell that follows.
name = 'ordinary'
# Write into results_colab/ — the folder the following download cell reads from and the
# one the other sections of this notebook write to. The workbooks used to be written to
# '../results/', so files.download() was pointed at files that were never created.
results_for_plotting.to_excel(f'results_colab/results_{name}.xlsx', index=False)
runtimes_for_plotting.to_excel(f'results_colab/runtimes_{name}.xlsx', index=False)

In [None]:
# Hand both workbooks for the current `name` to google.colab's download helper.
for xlsx_path in (f'results_colab/results_{name}.xlsx',
                  f'results_colab/runtimes_{name}.xlsx'):
    files.download(xlsx_path)

# High dimensional

In [None]:
import warnings

import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from scipy import stats

from wrappers.datasets_models_wrappers import DataModelsWrapper, DataModelsWrapperRandomSearch
from wrappers.datasets_models_wrappers_nlp import DataModelsWrapperNLP, DataModelsWrapperNLPRandomSearch
from data_processing.process_dataset import prepare_datasets_for_classification
from data_processing.process_dataset_nlp import prepare_nlp_for_classification
# Suppress every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation and convergence warnings —
# consider narrowing the filter.
warnings.filterwarnings('ignore')


def run(param_dict, mode='randomized', tuner='hyperopt', scoring='accuracy'):
    """Fit the benchmark wrapper selected by ``mode`` and return its result tables.

    NOTE(review): ``run`` is redefined in several sections of this notebook with
    different default ``mode`` values; whichever cell was executed last wins.

    Parameters
    ----------
    param_dict : dict
        Passed unchanged to the wrapper. In this notebook it maps a dataset label to an
        ``(X, y, models)`` tuple.
    mode : {'randomized', 'TPE'}
        ``'randomized'`` (default here) builds a ``DataModelsWrapperRandomSearch``;
        ``'TPE'`` builds a ``DataModelsWrapper`` configured with ``tuner``.
    tuner : str
        Forwarded to ``DataModelsWrapper``; unused when ``mode='randomized'``.
    scoring : str
        Forwarded to the selected wrapper.

    Returns
    -------
    tuple
        ``(all_datasets_results_, all_datasets_runtimes_, results_for_plotting_,
        runtimes_for_plotting_)`` read from the wrapper after ``fit()``.

    Raises
    ------
    ValueError
        If ``mode`` is not supported. Previously an unknown mode skipped both branches
        and failed later with an ``UnboundLocalError`` on ``model``.
    """
    if mode == 'randomized':
        model = DataModelsWrapperRandomSearch(param_dict, scoring=scoring)
    elif mode == 'TPE':
        model = DataModelsWrapper(param_dict, tuner=tuner, scoring=scoring)
    else:
        raise ValueError(f"mode must be 'randomized' or 'TPE', got {mode!r}")
    model.fit()
    return (model.all_datasets_results_, model.all_datasets_runtimes_,
            model.results_for_plotting_, model.runtimes_for_plotting_)


def run_nlp(param_dict, mode='randomized', tuner='hyperopt', scoring='accuracy',
            tfidf_kws=None):
    """Fit the NLP benchmark wrapper selected by ``mode`` and return its result tables.

    Parameters
    ----------
    param_dict : dict
        Passed unchanged to the wrapper. In this notebook it maps a dataset label to an
        ``(X, y, models)`` tuple.
    mode : {'randomized', 'TPE'}
        ``'randomized'`` (default here) builds a ``DataModelsWrapperNLPRandomSearch``;
        ``'TPE'`` builds a ``DataModelsWrapperNLP`` configured with ``tuner``.
    tuner : str
        Forwarded to ``DataModelsWrapperNLP``; unused when ``mode='randomized'``.
    scoring : str
        Forwarded to the selected wrapper.
    tfidf_kws : dict, optional
        Forwarded to the wrapper as ``tfidf_kws``. ``None`` (the default) selects
        ``{'ngram_range': (1, 2), 'min_df': 3, 'max_features': 10000}``.

    Returns
    -------
    tuple
        ``(all_datasets_results_, all_datasets_runtimes_, results_for_plotting_,
        runtimes_for_plotting_)`` read from the wrapper after ``fit()``.

    Raises
    ------
    ValueError
        If ``mode`` is not supported (previously an ``UnboundLocalError``).
    """
    if tfidf_kws is None:
        # Built per call: a dict literal in the signature would be one object shared by
        # every call, so a mutation made downstream would leak into later runs.
        tfidf_kws = {'ngram_range': (1, 2), 'min_df': 3, 'max_features': 10000}
    if mode == 'randomized':
        model = DataModelsWrapperNLPRandomSearch(param_dict, scoring=scoring, tfidf_kws=tfidf_kws)
    elif mode == 'TPE':
        model = DataModelsWrapperNLP(param_dict, tuner=tuner, scoring=scoring, tfidf_kws=tfidf_kws)
    else:
        raise ValueError(f"mode must be 'randomized' or 'TPE', got {mode!r}")
    model.fit()
    return (model.all_datasets_results_, model.all_datasets_runtimes_,
            model.results_for_plotting_, model.runtimes_for_plotting_)

In [None]:
# File name -> settings tuple handed to prepare_datasets_for_classification.
# NOTE(review): the tuple layout is not visible in this notebook; it looks like
# (target column, column selection, row limit) — confirm against the helper.
d = dict([
    ('prostate.csv', ('target', [], None)),
    ('leukemia.csv', ('target', [], None)),
    ('weather_dataset.csv', ('target', [], 500)),
])

# One (X, y) pair per file; presumably returned in declaration order — TODO confirm.
(X_1, y_1,
 X_2, y_2,
 X_3, y_3) = prepare_datasets_for_classification(d, data_path='data/')

# The text dataset goes through the separate NLP loading helper.
imdb_load_kws = dict(
    dataset_name='imdb_dataset.csv',
    text_column='review_cleared',
    y_col='sentiment',
    nrows=3000,
    data_path='data/',
)
X_4, y_4 = prepare_nlp_for_classification(**imdb_load_kws)

# Random-search distributions, one dict per library.
#
# scipy.stats.uniform(loc, scale) is uniform on [loc, loc + scale]: the second argument is
# the WIDTH of the interval, not its upper bound (unlike ray.tune.uniform(lower, upper),
# which the TPE section of this notebook uses). The spaces below reproduce the TPE ranges;
# the previous uniform(0.6, 0.8) actually sampled subsample from [0.6, 1.4] and
# uniform(0.5, 2) sampled gamma from [0.5, 2.5].
boosting_params = {
    'subsample': stats.uniform(loc=0.6, scale=0.8 - 0.6)  # [0.6, 0.8]
}
xgb_params = {
    'reg_alpha': stats.loguniform(1, 10),
    'reg_lambda': stats.loguniform(1, 10),
    'gamma': stats.uniform(loc=0.5, scale=2 - 0.5)  # [0.5, 2]
}
lgbm_params = {
    'reg_alpha': stats.loguniform(1, 10),
    'reg_lambda': stats.loguniform(1, 10)
}
catboost_params = {
    'reg_lambda': stats.loguniform(3, 10)
}

# Display name -> (estimator, search space). Each estimator is configured with a
# feature-subsampling option (max_features / colsample_*).
models = {}
models['Gradient Boosting'] = (GradientBoostingClassifier(max_features='sqrt'), boosting_params)
models['XGBoost'] = (
    XGBClassifier(colsample_bynode=0.5, use_label_encoder=False,
                  eval_metric='logloss', random_state=123),
    xgb_params,
)
models['LightGBM'] = (LGBMClassifier(colsample_bynode=0.5), lgbm_params)
models['CatBoost'] = (
    CatBoostClassifier(colsample_bylevel=0.5, n_estimators=100,
                       verbose=False, random_state=123),
    catboost_params,
)

# Dataset label -> (features, target, models); the same `models` object is shared by every entry.
tabular_sets = (
    ('prostate', X_1, y_1),
    ('leukemia', X_2, y_2),
    ('weather', X_3, y_3),
)
param_dict = {label: (X, y, models) for label, X, y in tabular_sets}

param_dict_nlp = {'IMDB reviews': (X_4, y_4, models)}

# Forwarded as `tfidf_kws` to the NLP wrapper.
tfidf_kws = {'ngram_range': (1, 2), 'min_df': 3, 'max_features': 10000}

# Tabular datasets.
all_results, all_runtimes, results_for_plotting, runtimes_for_plotting = run(param_dict=param_dict,
                                                                             mode='randomized', scoring='accuracy')

# Text dataset: only the two plotting frames are kept.
_, _, results_for_plotting_nlp, runtimes_for_plotting_nlp = run_nlp(param_dict=param_dict_nlp,
                                                                    mode='randomized',
                                                                    scoring='accuracy',
                                                                    tfidf_kws=tfidf_kws
                                                                    )

# `name` is also read by the download cell that follows.
name = 'high_dimensional'
# all_results / all_runtimes returned by run() are replaced here by the combined plotting frames.
all_results = pd.concat([results_for_plotting, results_for_plotting_nlp])
all_runtimes = pd.concat([runtimes_for_plotting, runtimes_for_plotting_nlp])
# Write into results_colab/ — the folder the following download cell reads from and the
# one the other sections of this notebook write to. The workbooks used to be written to
# '../results/', so files.download() was pointed at files that were never created.
all_results.to_excel(f'results_colab/results_{name}.xlsx', index=False)
all_runtimes.to_excel(f'results_colab/runtimes_{name}.xlsx', index=False)

In [None]:
# Hand both workbooks for the current `name` to google.colab's download helper.
for xlsx_path in (f'results_colab/results_{name}.xlsx',
                  f'results_colab/runtimes_{name}.xlsx'):
    files.download(xlsx_path)

# Runtimes dependent on data size

In [None]:
from time import time

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

from visualization.palettes import default_palette

# Select seaborn's 'whitegrid' style for the figures drawn below.
sns.set_style('whitegrid')


def runtimes_features(models, n_features_list, n_samples=100):
    """Measure how long each model takes to fit as the number of features grows.

    For every value in ``n_features_list`` a synthetic problem with ``n_samples`` rows is
    generated with ``make_classification`` and every model in ``models`` is fitted on it.

    Parameters
    ----------
    models : dict
        Display name -> estimator exposing ``fit(X, y)``. The estimators themselves are
        fitted (and re-fitted) in place.
    n_features_list : sequence of int
        Feature counts to benchmark, one generated dataset per entry.
    n_samples : int, default 100
        Number of rows of every generated dataset.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``n_features_list``: one column of fit durations (seconds)
        per model, followed by an ``n_features`` column.
    """
    # perf_counter is the monotonic, high-resolution clock intended for measuring
    # intervals; wall-clock time() can jump when the system clock is adjusted.
    from time import perf_counter

    results = []
    for n_features in n_features_list:
        X, y = make_classification(n_samples=n_samples, n_features=n_features)
        record = []
        for model in models.values():
            started = perf_counter()
            model.fit(X, y)
            record.append(perf_counter() - started)
        results.append(record)
    results = pd.DataFrame(results, columns=list(models))
    results['n_features'] = n_features_list
    return results


def runtimes_samples(models, n_samples_list, n_features=10):
    """Measure how long each model takes to fit as the number of samples grows.

    For every value in ``n_samples_list`` a synthetic problem with ``n_features`` columns
    is generated with ``make_classification`` and every model in ``models`` is fitted on it.

    Parameters
    ----------
    models : dict
        Display name -> estimator exposing ``fit(X, y)``. The estimators themselves are
        fitted (and re-fitted) in place.
    n_samples_list : sequence of int
        Row counts to benchmark, one generated dataset per entry.
    n_features : int, default 10
        Number of columns of every generated dataset.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``n_samples_list``: one column of fit durations (seconds)
        per model, followed by an ``n_samples`` column.
    """
    # perf_counter is the monotonic, high-resolution clock intended for measuring
    # intervals; wall-clock time() can jump when the system clock is adjusted.
    from time import perf_counter

    results = []
    for n_samples in n_samples_list:
        X, y = make_classification(n_samples=n_samples, n_features=n_features)
        record = []
        for model in models.values():
            started = perf_counter()
            model.fit(X, y)
            record.append(perf_counter() - started)
        results.append(record)
    results = pd.DataFrame(results, columns=list(models))
    results['n_samples'] = n_samples_list
    return results


def _format_seconds(value, pos=None):
    """Tick formatter: label a runtime in seconds without truncating fractions."""
    return f'{value:g}s'


def _plot_runtime_panel(ax, results, id_var, xlabel, base, **kwargs):
    """Draw one panel: runtime of every model against ``id_var`` on a log-scaled x axis."""
    melted = results.melt(id_vars=id_var, value_name='runtime', var_name='model')
    sns.lineplot(data=melted, x=id_var, y='runtime', hue='model', marker='o', ax=ax, **kwargs)
    ax.yaxis.set_major_formatter(ticker.FuncFormatter(_format_seconds))
    ax.set_xscale('log', base=base)
    ax.set_xlabel(xlabel)


def visualize_results(
        results_features,
        results_samples,
        out_path='../plots/runtimes_features_samples.pdf',
        figsize=(12, 8),
        base=10,
        save=False,
        **kwargs
):
    """Plot fit runtimes against feature count (left) and sample count (right).

    Parameters
    ----------
    results_features : pandas.DataFrame
        One runtime column per model plus an ``n_features`` column.
    results_samples : pandas.DataFrame
        One runtime column per model plus an ``n_samples`` column.
    out_path : str
        Where the figure is written when ``save`` is true.
    figsize : tuple
        Passed to ``plt.subplots``.
    base : int
        Base of the logarithmic x axis of both panels.
    save : bool
        When true the figure is saved to ``out_path`` with ``bbox_inches='tight'``.
    **kwargs
        Forwarded to both ``sns.lineplot`` calls (e.g. ``palette``).

    Notes
    -----
    Tick labels on the y axis used to be built with ``int(x)``, so every runtime below
    one second was labelled ``0s``; they are now formatted with ``:g`` (``0.5s``, ``20s``).
    """
    fig, ax = plt.subplots(1, 2, figsize=figsize)

    _plot_runtime_panel(ax[0], results_features, 'n_features', r'$n_{features}$', base, **kwargs)
    _plot_runtime_panel(ax[1], results_samples, 'n_samples', r'$n_{samples}$', base, **kwargs)

    if save:
        fig.savefig(out_path, bbox_inches='tight')


if __name__ == '__main__':
    # Benchmark driver: time every model over growing feature and sample counts, then plot.
    n_estimators = 100
    models = {
        'Gradient Boosting': GradientBoostingClassifier(n_estimators=n_estimators),
        'XGBoost': XGBClassifier(n_estimators=n_estimators, use_label_encoder=False,
                                 eval_metric='logloss', random_state=123),
        'LightGBM': LGBMClassifier(n_estimators=n_estimators),
        # The comma after this entry was missing, which made the whole cell a SyntaxError.
        'CatBoost Ordered': CatBoostClassifier(boosting_type='Ordered', n_estimators=n_estimators,
                                               verbose=False, random_state=123),
        'CatBoost Plain': CatBoostClassifier(boosting_type='Plain', n_estimators=n_estimators,
                                             verbose=False, random_state=123)
    }

    base = 5
    n_features_list = base**np.arange(1, 7)  # 5, 25, ..., 15625 features
    n_samples_list = base**np.arange(1, 9)   # 5, 25, ..., 390625 samples

    results_features = runtimes_features(models, n_features_list)
    print('features done')
    results_samples = runtimes_samples(models, n_samples_list)
    print('samples done')
    # The same `base` drives the size grids above and the log scale of the plot's x axes.
    visualize_results(results_features, results_samples, base=base, palette='rainbow',
                      save=True, out_path='../plots_colab/runtimes_features_samples.pdf')


# LightGBM & XGBoost kwargs (legit)

In [None]:
import warnings

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import make_classification
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from scipy import stats

from wrappers.datasets_models_wrappers import DataModelsWrapper, DataModelsWrapperRandomSearch
from data_processing.process_dataset import prepare_datasets_for_classification
# Suppress every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation and convergence warnings —
# consider narrowing the filter.
warnings.filterwarnings('ignore')


def run(param_dict, mode='randomized', tuner='hyperopt', scoring='accuracy'):
    """Fit the benchmark wrapper selected by ``mode`` and return its result tables.

    NOTE(review): ``run`` is redefined in several sections of this notebook with
    different default ``mode`` values; whichever cell was executed last wins.

    Parameters
    ----------
    param_dict : dict
        Passed unchanged to the wrapper. In this notebook it maps a dataset label to an
        ``(X, y, models)`` tuple.
    mode : {'randomized', 'TPE'}
        ``'randomized'`` (default here) builds a ``DataModelsWrapperRandomSearch``;
        ``'TPE'`` builds a ``DataModelsWrapper`` configured with ``tuner``.
    tuner : str
        Forwarded to ``DataModelsWrapper``; unused when ``mode='randomized'``.
    scoring : str
        Forwarded to the selected wrapper.

    Returns
    -------
    tuple
        ``(all_datasets_results_, all_datasets_runtimes_, results_for_plotting_,
        runtimes_for_plotting_)`` read from the wrapper after ``fit()``.

    Raises
    ------
    ValueError
        If ``mode`` is not supported. Previously an unknown mode skipped both branches
        and failed later with an ``UnboundLocalError`` on ``model``.
    """
    if mode == 'randomized':
        model = DataModelsWrapperRandomSearch(param_dict, scoring=scoring)
    elif mode == 'TPE':
        model = DataModelsWrapper(param_dict, tuner=tuner, scoring=scoring)
    else:
        raise ValueError(f"mode must be 'randomized' or 'TPE', got {mode!r}")
    model.fit()
    return (model.all_datasets_results_, model.all_datasets_runtimes_,
            model.results_for_plotting_, model.runtimes_for_plotting_)


# Synthetic data with make_classification's default settings (no random_state is passed,
# so the data differs between runs).
X_1, y_1 = make_classification()

# Candidate values for the randomized search.
# NOTE(review): 'hist' looks like an XGBoost tree_method rather than a booster — confirm
# whether passing it as `booster` is the intended experiment.
xgb_params = dict(booster=['dart', 'hist'])
lgbm_params = dict(extra_trees=[True, False])
lgbm_params1 = dict(extra_trees=[False, True])

# Display name -> (estimator, search space); LightGBM appears twice with the candidate order swapped.
models = {}
models['XGBoost'] = (
    XGBClassifier(booster='dart', rate_drop=0.5, use_label_encoder=False,
                  eval_metric='logloss', random_state=123),
    xgb_params,
)
models['LightGBM'] = (LGBMClassifier(), lgbm_params)
models['LightGBM2'] = (LGBMClassifier(), lgbm_params1)

# NOTE(review): the label says 'mushrooms' but the data is the synthetic set generated above.
param_dict = {'mushrooms': (X_1, y_1, models)}

search_outputs = run(param_dict=param_dict, mode='randomized', scoring='accuracy')
all_results, all_runtimes, results_for_plotting, runtimes_for_plotting = search_outputs