# Setup

In [None]:
# Colab-only helpers: `files` is used further down to download the result
# spreadsheets, `drive` to mount Google Drive (COLAB_DATA_PATH points into it).
from google.colab import files
from google.colab import drive
# Mount Google Drive under /content/drive.
drive.mount('/content/drive')

In [None]:
# Directory with the input CSV files on the mounted Drive.
# The path is relative ('../drive/...'): it only points at the mount created by
# drive.mount('/content/drive') once the working directory is a sibling of
# `drive`, i.e. after the notebook has changed into the cloned repository.
# NOTE(review): assumes the notebook starts in /content -- confirm.
COLAB_DATA_PATH = '../drive/MyDrive/data_colab/'

In [None]:
# GitHub account and repository name; both are interpolated into the
# `git clone` URL in the next cell.
usr = '243046'
repo = 'boost'

In [None]:
# Clone the project sources; {usr} and {repo} are filled in from the Python
# variables of the same names.
!git clone https://github.com/{usr}/{repo}

In [None]:
%cd boost

In [None]:
!pip install -r requirements.txt

# All 12 datasets - no tuning
## with 150 trees (default and microarray model groups) / 50 trees (image, NLP and weather model groups)

In [None]:
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from data_processing.process_dataset import prepare_datasets_for_classification
from data_processing.process_dataset_nlp import prepare_nlp_for_classification
from experiments_runners.basic_runners import run, run_nlp


# Tabular datasets to load, keyed by CSV file name. Every value is a 3-tuple that
# is handed to prepare_datasets_for_classification unchanged; judging by the
# entries it looks like (target column, categorical columns or 'all',
# optional row cap) -- TODO confirm against data_processing.process_dataset.
d = dict([
    ('adult_full.csv', ('profit', [], None)),
    ('heart.csv', ('target', ['cp', 'restecg'], None)),
    ('amazon.csv', ('ACTION', 'all', None)),
    ('mushrooms.csv', ('class', 'all', None)),
    ('breast_cancer.csv', ('target', [], None)),
    ('churn.csv', ('Churn', [], None)),
    ('creditcard_full.csv', ('Class', [], 30000)),
    ('prostate.csv', ('target', [], None)),
    ('leukemia.csv', ('target', [], None)),
    ('gina_agnostic.csv', ('target', [], None)),
    ('weather_dataset_full_2500_grayscale.csv', ('target', [], None)),
])

# The loader's result is unpacked into one (X_i, y_i) pair per entry of `d`
# (11 entries -> 22 values), presumably in the dict's insertion order.
(
    X_1, y_1,
    X_2, y_2,
    X_3, y_3,
    X_4, y_4,
    X_5, y_5,
    X_6, y_6,
    X_7, y_7,
    X_8, y_8,
    X_9, y_9,
    X_10, y_10,
    X_11, y_11,
) = prepare_datasets_for_classification(d, data_path=COLAB_DATA_PATH)

# Twelfth dataset: IMDB reviews, loaded through the separate NLP loader.
X_12, y_12 = prepare_nlp_for_classification(
    dataset_name='imdb_dataset_full.csv', text_column='review_cleared', y_col='sentiment',
    nrows=10000, data_path=COLAB_DATA_PATH,
)

# "Default" model group: 150 boosting rounds for every library.
n_estimators_default = 150

# Constructor keyword arguments, one dict per library.
boosting_init_default = {
    'n_estimators': n_estimators_default
}
# NOTE(review): 'reg:logistic' is used with a classifier; 'binary:logistic' is the
# more common choice -- confirm this is intentional.
xgb_init_default = {
    'objective': 'reg:logistic',
    'n_estimators': n_estimators_default
}
lgbm_init_default = {
    'boosting_type': 'goss',
    'n_estimators': n_estimators_default
}
catboost_init_default = {
    'boosting_type': 'Ordered',
    'n_estimators': n_estimators_default,
    'thread_count': 1
}

# Display name -> (estimator, search space). The search space is empty in this
# section, which is headed "no tuning".
# Every estimator is seeded with random_state=123. Previously only XGBoost and
# CatBoost were, so the comparison was not seeded consistently across learners.
models_default = {
    'Gradient Boosting': (GradientBoostingClassifier(**boosting_init_default, random_state=123), {}),
    'XGBoost': (XGBClassifier(**xgb_init_default, use_label_encoder=False,
                              eval_metric='logloss', random_state=123), {}),
    'LightGBM': (LGBMClassifier(**lgbm_init_default, random_state=123), {}),
    'CatBoost': (CatBoostClassifier(**catboost_init_default, verbose=False, random_state=123), {})
}

# Model group used for the microarray datasets (prostate, leukemia):
# 150 boosting rounds; CatBoost uses 'Plain' boosting instead of 'Ordered'.
n_estimators_reg_microarray = 150

# Constructor keyword arguments, one dict per library.
boosting_init_reg_microarray = {
    'n_estimators': n_estimators_reg_microarray
}
xgb_init_reg_microarray = {
    'objective': 'reg:logistic',
    'n_estimators': n_estimators_reg_microarray
}
lgbm_init_reg_microarray = {
    'boosting_type': 'goss',
    'n_estimators': n_estimators_reg_microarray
}
catboost_init_reg_microarray = {
    'boosting_type': 'Plain',
    'n_estimators': n_estimators_reg_microarray,
    'thread_count': 1
}

# Display name -> (estimator, search space); the search space is empty here.
# All four estimators now share random_state=123; Gradient Boosting and LightGBM
# were previously left unseeded.
models_reg_microarray = {
    'Gradient Boosting': (GradientBoostingClassifier(**boosting_init_reg_microarray, random_state=123), {}),
    'XGBoost': (XGBClassifier(**xgb_init_reg_microarray, use_label_encoder=False,
                              eval_metric='logloss', random_state=123), {}),
    'LightGBM': (LGBMClassifier(**lgbm_init_reg_microarray, random_state=123), {}),
    'CatBoost': (CatBoostClassifier(**catboost_init_reg_microarray, verbose=False, random_state=123), {})
}

# Model group used for the image-like and NLP datasets (gina agnostic, IMDB
# reviews): only 50 boosting rounds; CatBoost uses 'Plain' boosting.
n_estimators_reg_image_nlp = 50

# Constructor keyword arguments, one dict per library.
boosting_init_reg_image_nlp = {
    'n_estimators': n_estimators_reg_image_nlp
}
xgb_init_reg_image_nlp = {
    'objective': 'reg:logistic',
    'n_estimators': n_estimators_reg_image_nlp
}
lgbm_init_reg_image_nlp = {
    'boosting_type': 'goss',
    'n_estimators': n_estimators_reg_image_nlp
}
catboost_init_reg_image_nlp = {
    'boosting_type': 'Plain',
    'n_estimators': n_estimators_reg_image_nlp,
    'thread_count': 1
}

# Display name -> (estimator, search space); the search space is empty here.
# All four estimators now share random_state=123; Gradient Boosting and LightGBM
# were previously left unseeded.
models_reg_image_nlp = {
    'Gradient Boosting': (GradientBoostingClassifier(**boosting_init_reg_image_nlp, random_state=123), {}),
    'XGBoost': (XGBClassifier(**xgb_init_reg_image_nlp, use_label_encoder=False,
                              eval_metric='logloss', random_state=123), {}),
    'LightGBM': (LGBMClassifier(**lgbm_init_reg_image_nlp, random_state=123), {}),
    'CatBoost': (CatBoostClassifier(**catboost_init_reg_image_nlp, verbose=False, random_state=123), {})
}

# Model group for the weather dataset, the only one configured with multi-class
# objectives. It reuses the 50-round budget of the image/NLP group and derives
# the number of classes from the distinct labels in y_11.
boosting_init_reg_weather = {
    'n_estimators': n_estimators_reg_image_nlp
}
xgb_init_reg_weather = {
    'objective': 'multi:softmax',
    'num_class': len(set(y_11)),
    'n_estimators': n_estimators_reg_image_nlp
}
lgbm_init_reg_weather = {
    'objective': 'softmax',
    'num_class': len(set(y_11)),
    'boosting_type': 'goss',
    'n_estimators': n_estimators_reg_image_nlp
}
catboost_init_reg_weather = {
    'boosting_type': 'Plain',
    'n_estimators': n_estimators_reg_image_nlp,
    'thread_count': 1
}

# Display name -> (estimator, search space); the search space is empty here.
# Changes compared with the previous revision:
#  * all four estimators share random_state=123 (Gradient Boosting and LightGBM
#    were unseeded);
#  * XGBoost's eval_metric is 'mlogloss', the multi-class counterpart of
#    'logloss', to match the 'multi:softmax' objective.
models_reg_weather = {
    'Gradient Boosting': (GradientBoostingClassifier(**boosting_init_reg_weather, random_state=123), {}),
    'XGBoost': (XGBClassifier(**xgb_init_reg_weather, use_label_encoder=False,
                              eval_metric='mlogloss', random_state=123), {}),
    'LightGBM': (LGBMClassifier(**lgbm_init_reg_weather, random_state=123), {}),
    'CatBoost': (CatBoostClassifier(**catboost_init_reg_weather, verbose=False, random_state=123), {})
}

# Experiment plan for the tabular runner: dataset label -> (X, y, model group).
# Each dataset receives its own shallow copy of the model-group dict; the
# estimator objects inside a group are therefore still shared between datasets.
param_dict = {
    dataset_name: (features, target, model_group.copy())
    for dataset_name, features, target, model_group in (
        ('adult', X_1, y_1, models_default),
        ('heart', X_2, y_2, models_default),
        ('amazon', X_3, y_3, models_default),
        ('mushrooms', X_4, y_4, models_default),
        ('breast cancer', X_5, y_5, models_default),
        ('churn', X_6, y_6, models_default),
        ('creditcard', X_7, y_7, models_default),
        ('prostate', X_8, y_8, models_reg_microarray),
        ('leukemia', X_9, y_9, models_reg_microarray),
        ('gina agnostic', X_10, y_10, models_reg_image_nlp),
        ('weather dataset', X_11, y_11, models_reg_weather),
    )
}

# Same structure for the NLP runner (single dataset).
param_dict_nlp = {'IMDB reviews': (X_12, y_12, models_reg_image_nlp.copy())}

# Keyword arguments forwarded to run_nlp as `tfidf_kws`.
tfidf_kws = dict(ngram_range=(1, 2), min_df=3, max_features=10000)

# Run the tabular and the NLP experiments.
# NOTE(review): mode='TPE' is passed although this section is "no tuning";
# presumably the empty search spaces make the tuner a no-op -- confirm.
results, tuning_times, runtimes = run(param_dict=param_dict, mode='TPE')
results_nlp, tuning_times_nlp, runtimes_nlp = run_nlp(
    param_dict=param_dict_nlp,
    mode='TPE',
    tfidf_kws=tfidf_kws,
)

# Suffix shared by every spreadsheet written below (and by the download cell).
name = '12_datasets_no_tuning_150_50_trees'

# One results workbook per scoring key, tabular rows followed by the NLP rows.
for scoring in results:
    path_to_save = f'results_{scoring}_{name}.xlsx'
    combined = pd.concat([results[scoring], results_nlp[scoring]])
    combined.to_excel(path_to_save, index=False)

# Timing tables are merged the same way and written to their own workbooks.
all_tuning_times = pd.concat([tuning_times, tuning_times_nlp])
all_runtimes = pd.concat([runtimes, runtimes_nlp])
all_tuning_times.to_excel(f'tuning_times_{name}.xlsx', index=False)
all_runtimes.to_excel(f'runtimes_{name}.xlsx', index=False)

In [None]:
# Send every spreadsheet written by the previous cell to the browser:
# the per-scoring result files first, then the two timing files.
paths_to_download = [f'results_{scoring}_{name}.xlsx' for scoring in results]
paths_to_download.append(f'tuning_times_{name}.xlsx')
paths_to_download.append(f'runtimes_{name}.xlsx')
for path_to_download in paths_to_download:
    files.download(path_to_download)

# All 12 datasets - with TPE tuning
## with 150 trees (default and microarray model groups) / 50 trees (image, NLP and weather model groups)

In [None]:
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from ray import tune

from data_processing.process_dataset import prepare_datasets_for_classification
from data_processing.process_dataset_nlp import prepare_nlp_for_classification
from experiments_runners.basic_runners import run, run_nlp


# Tabular datasets to load, keyed by CSV file name. Every value is a 3-tuple that
# is handed to prepare_datasets_for_classification unchanged; judging by the
# entries it looks like (target column, categorical columns or 'all',
# optional row cap) -- TODO confirm against data_processing.process_dataset.
d = dict([
    ('adult_full.csv', ('profit', [], None)),
    ('heart.csv', ('target', ['cp', 'restecg'], None)),
    ('amazon.csv', ('ACTION', 'all', None)),
    ('mushrooms.csv', ('class', 'all', None)),
    ('breast_cancer.csv', ('target', [], None)),
    ('churn.csv', ('Churn', [], None)),
    ('creditcard_full.csv', ('Class', [], 30000)),
    ('prostate.csv', ('target', [], None)),
    ('leukemia.csv', ('target', [], None)),
    ('gina_agnostic.csv', ('target', [], None)),
    ('weather_dataset_full_2500_grayscale.csv', ('target', [], None)),
])

# The loader's result is unpacked into one (X_i, y_i) pair per entry of `d`
# (11 entries -> 22 values), presumably in the dict's insertion order.
(
    X_1, y_1,
    X_2, y_2,
    X_3, y_3,
    X_4, y_4,
    X_5, y_5,
    X_6, y_6,
    X_7, y_7,
    X_8, y_8,
    X_9, y_9,
    X_10, y_10,
    X_11, y_11,
) = prepare_datasets_for_classification(d, data_path=COLAB_DATA_PATH)

# Twelfth dataset: IMDB reviews, loaded through the separate NLP loader.
X_12, y_12 = prepare_nlp_for_classification(
    dataset_name='imdb_dataset_full.csv', text_column='review_cleared', y_col='sentiment',
    nrows=10000, data_path=COLAB_DATA_PATH,
)

# Search spaces for the "default" model group, one per library, keyed by the
# estimator parameter name. Values are either ray.tune distributions or plain
# lists (presumably treated as discrete choices by the runner -- TODO confirm).
boosting_params_default = dict(
    max_depth=[2, 3, 4, 5, 8, 10],
    learning_rate=tune.loguniform(0.01, 0.3),
    min_samples_split=[2, 5, 10],
)

# Written as a literal because 'lambda' is a Python keyword and cannot be
# passed as a dict(...) keyword argument.
xgb_params_default = {
    'max_depth': [2, 3, 4, 5, 8, 10],
    'learning_rate': tune.loguniform(0.01, 0.3),
    'gamma': tune.uniform(0, 3),
    'alpha': tune.uniform(0, 1),
    'lambda': tune.uniform(0, 3),
}

lgbm_params_default = dict(
    learning_rate=tune.loguniform(0.01, 0.3),
    num_leaves=[3, 7, 15, 31, 127],
    top_rate=tune.uniform(0.1, 0.5),
    other_rate=tune.uniform(0.05, 0.2),
    reg_alpha=tune.uniform(0, 1),
    reg_lambda=tune.uniform(0, 3),
)

catboost_params_default = dict(
    max_depth=[2, 3, 4, 5, 8, 10],
    l2_leaf_reg=tune.uniform(0, 5),
    leaf_estimation_iterations=[1, 10],
)

# "Default" model group for the tuned run: 150 boosting rounds, 75 % row
# subsampling and 60 % feature subsampling.
n_estimators_default = 150
subsample_default = 0.75
colsample_bynode_default = 0.6

# Constructor keyword arguments, one dict per library. The feature-subsampling
# fraction is passed under each library's own parameter name.
boosting_init_default = {
    'n_estimators': n_estimators_default,
    'subsample': subsample_default,
    'max_features': colsample_bynode_default
}
# NOTE(review): 'reg:logistic' is used with a classifier; 'binary:logistic' is the
# more common choice -- confirm this is intentional.
xgb_init_default = {
    'objective': 'reg:logistic',
    'n_estimators': n_estimators_default,
    'subsample': subsample_default,
    'colsample_bynode': colsample_bynode_default
}
# No 'subsample' for LightGBM: with boosting_type='goss' row sampling is
# presumably governed by top_rate/other_rate from the search space -- TODO confirm.
lgbm_init_default = {
    'boosting_type': 'goss',
    'n_estimators': n_estimators_default,
    'colsample_bynode': colsample_bynode_default
}
catboost_init_default = {
    'boosting_type': 'Ordered',
    'n_estimators': n_estimators_default,
    'subsample': subsample_default,
    'colsample_bylevel': colsample_bynode_default,
    'thread_count': 1
}

# Display name -> (estimator, search space).
# Every estimator is seeded with random_state=123. Previously only XGBoost and
# CatBoost were, although Gradient Boosting subsamples rows and features here.
models_default = {
    'Gradient Boosting': (GradientBoostingClassifier(**boosting_init_default, random_state=123),
                          boosting_params_default),
    'XGBoost': (XGBClassifier(**xgb_init_default, use_label_encoder=False,
                              eval_metric='logloss', random_state=123), xgb_params_default),
    'LightGBM': (LGBMClassifier(**lgbm_init_default, random_state=123), lgbm_params_default),
    'CatBoost': (CatBoostClassifier(**catboost_init_default, verbose=False, random_state=123),
                 catboost_params_default)
}

# Variant of the default group with row subsampling switched off
# (subsample = 1); it is assigned to the 'heart' and 'breast cancer' datasets.
# The init dicts are copied first so the default group's dicts stay untouched.
# LightGBM's dict has no 'subsample' entry, so it is copied unchanged.
boosting_init_default_small_rows = boosting_init_default.copy()
boosting_init_default_small_rows.update({'subsample': 1})
xgb_init_default_small_rows = xgb_init_default.copy()
xgb_init_default_small_rows.update({'subsample': 1})
lgbm_init_default_small_rows = lgbm_init_default.copy()
catboost_init_default_small_rows = catboost_init_default.copy()
catboost_init_default_small_rows.update({'subsample': 1})

# Display name -> (estimator, search space); search spaces are the default ones.
# All four estimators now share random_state=123; Gradient Boosting and LightGBM
# were previously left unseeded.
models_default_small_rows = {
    'Gradient Boosting': (GradientBoostingClassifier(**boosting_init_default_small_rows, random_state=123),
                          boosting_params_default),
    'XGBoost': (XGBClassifier(**xgb_init_default_small_rows, use_label_encoder=False,
                              eval_metric='logloss', random_state=123), xgb_params_default),
    'LightGBM': (LGBMClassifier(**lgbm_init_default_small_rows, random_state=123), lgbm_params_default),
    'CatBoost': (CatBoostClassifier(**catboost_init_default_small_rows, verbose=False, random_state=123),
                 catboost_params_default)
}

# Search spaces for the "reg" model groups (microarray, image/NLP, weather).
# Compared with the default spaces, the upper bounds of the regularisation
# parameters are larger (gamma/lambda up to 10, alpha up to 5, l2_leaf_reg up
# to 12); the Gradient Boosting space is the same as the default one.
boosting_params_reg = dict(
    max_depth=[2, 3, 4, 5, 8, 10],
    learning_rate=tune.loguniform(0.01, 0.3),
    min_samples_split=[2, 5, 10],
)

# Written as a literal because 'lambda' is a Python keyword and cannot be
# passed as a dict(...) keyword argument.
xgb_params_reg = {
    'max_depth': [2, 3, 4, 5, 8, 10],
    'learning_rate': tune.loguniform(0.01, 0.3),
    'gamma': tune.uniform(0, 10),
    'alpha': tune.uniform(0, 5),
    'lambda': tune.uniform(0, 10),
}

lgbm_params_reg = dict(
    learning_rate=tune.loguniform(0.01, 0.3),
    num_leaves=[3, 7, 15, 31, 127],
    top_rate=tune.uniform(0.1, 0.5),
    other_rate=tune.uniform(0.05, 0.2),
    reg_alpha=tune.uniform(0, 5),
    reg_lambda=tune.uniform(0, 10),
)

catboost_params_reg = dict(
    max_depth=[2, 3, 4, 5, 8, 10],
    l2_leaf_reg=tune.uniform(0, 12),
    leaf_estimation_iterations=[1, 10],
)

# Model group used for the microarray datasets (prostate, leukemia):
# 150 boosting rounds, no row subsampling, 40 % feature subsampling.
n_estimators_reg_microarray = 150
subsample_reg_microarray = 1
colsample_bynode_reg_microarray = 0.4

# Constructor keyword arguments, one dict per library. The feature-subsampling
# fraction is passed under each library's own parameter name.
boosting_init_reg_microarray = {
    'n_estimators': n_estimators_reg_microarray,
    'subsample': subsample_reg_microarray,
    'max_features': colsample_bynode_reg_microarray
}
xgb_init_reg_microarray = {
    'objective': 'reg:logistic',
    'n_estimators': n_estimators_reg_microarray,
    'subsample': subsample_reg_microarray,
    'colsample_bynode': colsample_bynode_reg_microarray
}
lgbm_init_reg_microarray = {
    'boosting_type': 'goss',
    'n_estimators': n_estimators_reg_microarray,
    'colsample_bynode': colsample_bynode_reg_microarray
}
catboost_init_reg_microarray = {
    'boosting_type': 'Plain',
    'n_estimators': n_estimators_reg_microarray,
    'subsample': subsample_reg_microarray,
    'colsample_bylevel': colsample_bynode_reg_microarray,
    'thread_count': 1
}

# Display name -> (estimator, search space); uses the "reg" search spaces.
# All four estimators now share random_state=123; Gradient Boosting and LightGBM
# were previously left unseeded.
models_reg_microarray = {
    'Gradient Boosting': (GradientBoostingClassifier(**boosting_init_reg_microarray, random_state=123),
                          boosting_params_reg),
    'XGBoost': (XGBClassifier(**xgb_init_reg_microarray, use_label_encoder=False,
                              eval_metric='logloss', random_state=123), xgb_params_reg),
    'LightGBM': (LGBMClassifier(**lgbm_init_reg_microarray, random_state=123), lgbm_params_reg),
    'CatBoost': (CatBoostClassifier(**catboost_init_reg_microarray, verbose=False, random_state=123),
                 catboost_params_reg)
}

# Model group used for the image-like and NLP datasets (gina agnostic, IMDB
# reviews): 50 boosting rounds, 50 % row subsampling, 40 % feature subsampling.
n_estimators_reg_image_nlp = 50
subsample_reg_image_nlp = 0.5
colsample_bynode_reg_image_nlp = 0.4

# Constructor keyword arguments, one dict per library. The feature-subsampling
# fraction is passed under each library's own parameter name.
boosting_init_reg_image_nlp = {
    'n_estimators': n_estimators_reg_image_nlp,
    'subsample': subsample_reg_image_nlp,
    'max_features': colsample_bynode_reg_image_nlp
}
xgb_init_reg_image_nlp = {
    'objective': 'reg:logistic',
    'n_estimators': n_estimators_reg_image_nlp,
    'subsample': subsample_reg_image_nlp,
    'colsample_bynode': colsample_bynode_reg_image_nlp
}
lgbm_init_reg_image_nlp = {
    'boosting_type': 'goss',
    'n_estimators': n_estimators_reg_image_nlp,
    'colsample_bynode': colsample_bynode_reg_image_nlp
}
catboost_init_reg_image_nlp = {
    'boosting_type': 'Plain',
    'n_estimators': n_estimators_reg_image_nlp,
    'subsample': subsample_reg_image_nlp,
    'colsample_bylevel': colsample_bynode_reg_image_nlp,
    'thread_count': 1
}

# Display name -> (estimator, search space); uses the "reg" search spaces.
# All four estimators now share random_state=123; Gradient Boosting and LightGBM
# were previously left unseeded.
# CatBoost additionally gets bootstrap_type='Bernoulli' in this group,
# presumably so that `subsample` is honoured -- TODO confirm.
models_reg_image_nlp = {
    'Gradient Boosting': (GradientBoostingClassifier(**boosting_init_reg_image_nlp, random_state=123),
                          boosting_params_reg),
    'XGBoost': (XGBClassifier(**xgb_init_reg_image_nlp, use_label_encoder=False,
                              eval_metric='logloss', random_state=123), xgb_params_reg),
    'LightGBM': (LGBMClassifier(**lgbm_init_reg_image_nlp, random_state=123), lgbm_params_reg),
    'CatBoost': (CatBoostClassifier(**catboost_init_reg_image_nlp, bootstrap_type='Bernoulli', verbose=False,
                                    random_state=123), catboost_params_reg)
}

# Model group for the weather dataset, the only one configured with multi-class
# objectives. It reuses the round budget and sampling fractions of the image/NLP
# group and derives the number of classes from the distinct labels in y_11.
boosting_init_reg_weather = {
    'n_estimators': n_estimators_reg_image_nlp,
    'subsample': subsample_reg_image_nlp,
    'max_features': colsample_bynode_reg_image_nlp
}
xgb_init_reg_weather = {
    'objective': 'multi:softmax',
    'num_class': len(set(y_11)),
    'n_estimators': n_estimators_reg_image_nlp,
    'subsample': subsample_reg_image_nlp,
    'colsample_bynode': colsample_bynode_reg_image_nlp
}
lgbm_init_reg_weather = {
    'objective': 'softmax',
    'num_class': len(set(y_11)),
    'boosting_type': 'goss',
    'n_estimators': n_estimators_reg_image_nlp,
    'colsample_bynode': colsample_bynode_reg_image_nlp
}
catboost_init_reg_weather = {
    'boosting_type': 'Plain',
    'n_estimators': n_estimators_reg_image_nlp,
    'subsample': subsample_reg_image_nlp,
    'colsample_bylevel': colsample_bynode_reg_image_nlp,
    'thread_count': 1
}

# Display name -> (estimator, search space); uses the "reg" search spaces.
# Changes compared with the previous revision:
#  * all four estimators share random_state=123 (Gradient Boosting and LightGBM
#    were unseeded);
#  * XGBoost's eval_metric is 'mlogloss', the multi-class counterpart of
#    'logloss', to match the 'multi:softmax' objective.
# CatBoost additionally gets bootstrap_type='Bernoulli' in this group,
# presumably so that `subsample` is honoured -- TODO confirm.
models_reg_weather = {
    'Gradient Boosting': (GradientBoostingClassifier(**boosting_init_reg_weather, random_state=123),
                          boosting_params_reg),
    'XGBoost': (XGBClassifier(**xgb_init_reg_weather, use_label_encoder=False,
                              eval_metric='mlogloss', random_state=123), xgb_params_reg),
    'LightGBM': (LGBMClassifier(**lgbm_init_reg_weather, random_state=123), lgbm_params_reg),
    'CatBoost': (CatBoostClassifier(**catboost_init_reg_weather, bootstrap_type='Bernoulli', verbose=False,
                                    random_state=123), catboost_params_reg)
}

# Experiment plan for the tabular runner: dataset label -> (X, y, model group).
# 'heart' and 'breast cancer' use the small-rows variant of the default group.
# Each dataset receives its own shallow copy of the model-group dict; the
# estimator objects inside a group are therefore still shared between datasets.
param_dict = {
    dataset_name: (features, target, model_group.copy())
    for dataset_name, features, target, model_group in (
        ('adult', X_1, y_1, models_default),
        ('heart', X_2, y_2, models_default_small_rows),
        ('amazon', X_3, y_3, models_default),
        ('mushrooms', X_4, y_4, models_default),
        ('breast cancer', X_5, y_5, models_default_small_rows),
        ('churn', X_6, y_6, models_default),
        ('creditcard', X_7, y_7, models_default),
        ('prostate', X_8, y_8, models_reg_microarray),
        ('leukemia', X_9, y_9, models_reg_microarray),
        ('gina agnostic', X_10, y_10, models_reg_image_nlp),
        ('weather dataset', X_11, y_11, models_reg_weather),
    )
}

# Same structure for the NLP runner (single dataset).
param_dict_nlp = {'IMDB reviews': (X_12, y_12, models_reg_image_nlp.copy())}

# Keyword arguments forwarded to run_nlp as `tfidf_kws`.
tfidf_kws = dict(ngram_range=(1, 2), min_df=3, max_features=10000)

# Run the tabular and the NLP experiments with TPE tuning.
results, tuning_times, runtimes = run(param_dict=param_dict, mode='TPE')
results_nlp, tuning_times_nlp, runtimes_nlp = run_nlp(
    param_dict=param_dict_nlp,
    mode='TPE',
    tfidf_kws=tfidf_kws,
)

# Suffix shared by every spreadsheet written below (and by the download cell).
name = '12_datasets_TPE_150_50_trees'

# One results workbook per scoring key, tabular rows followed by the NLP rows.
for scoring in results:
    path_to_save = f'results_{scoring}_{name}.xlsx'
    combined = pd.concat([results[scoring], results_nlp[scoring]])
    combined.to_excel(path_to_save, index=False)

# Timing tables are merged the same way and written to their own workbooks.
all_tuning_times = pd.concat([tuning_times, tuning_times_nlp])
all_runtimes = pd.concat([runtimes, runtimes_nlp])
all_tuning_times.to_excel(f'tuning_times_{name}.xlsx', index=False)
all_runtimes.to_excel(f'runtimes_{name}.xlsx', index=False)

In [None]:
# Send every spreadsheet written by the previous cell to the browser:
# the per-scoring result files first, then the two timing files.
paths_to_download = [f'results_{scoring}_{name}.xlsx' for scoring in results]
paths_to_download.append(f'tuning_times_{name}.xlsx')
paths_to_download.append(f'runtimes_{name}.xlsx')
for path_to_download in paths_to_download:
    files.download(path_to_download)