In [1]:
import pandas as pd
import numpy as np
import optuna
import time
import warnings
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from tqdm import tqdm


warnings.filterwarnings('ignore')
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [2]:
train = pd.read_excel("data/train_prepared.xlsx", index_col=0)
test = pd.read_excel("data/test_prepared.xlsx", index_col=0)

x_train = train.drop(columns=['Transported'])
y_train = train[['Transported']]

x_test = test.drop(columns=['Transported'])
y_test = test[['Transported']]

### <center>RandomizedSearchCV on RandomForestClassifier

In [3]:
rf_clf = RandomForestClassifier(random_state=42)

rf_random_params = {
    'n_estimators': np.arange(400, 701, 100),
    'max_depth': [None] + list(range(5, 51, 5)),
    'max_features': list(range(3, 6)),
    'criterion': ['gini', 'entropy'],
    'min_samples_leaf': list(range(1, 21)),
    'min_samples_split': list(range(2, 21)),
    'class_weight': [{0: 1, 1: w} for w in np.linspace(0.01, 1, 10)]
}


rs_rf_start_time = time.time()
rs = RandomizedSearchCV(
    estimator=rf_clf, 
    param_distributions=rf_random_params, 
    n_iter=768, 
    cv=3, 
    scoring='accuracy', 
    n_jobs=-1, 
    verbose=True, 
    random_state=42
)
rs.fit(x_train, y_train)
rs_rf_end_time = time.time()
rs_rf_seconds = rs_rf_end_time - rs_rf_start_time

print("Best hyperparameters:")
for key, value in rs.best_params_.items():
    print(f"{key} = {value}")
print(f"\nBest Accuracy value: {rs.best_score_}")
print(f"Search took {rs_rf_seconds // 60:.0f} minutes and {rs_rf_seconds % 60:.0f} seconds")

Fitting 3 folds for each of 768 candidates, totalling 2304 fits
Best hyperparameters:
n_estimators = 700
min_samples_split = 4
min_samples_leaf = 1
max_features = 5
max_depth = None
criterion = gini
class_weight = {0: 1, 1: 0.01}

Best Accuracy value: 0.767284425739199
Search took 12 minutes and 48 seconds


In [4]:
rf_clf_rs = rs.best_estimator_

rf_clf_rs.fit(x_train, y_train)

rf_clf_rs_pred = rf_clf_rs.predict(x_test)
print(f"Accuracy on test: {accuracy_score(y_test, rf_clf_rs_pred)}")

Accuracy on test: 0.5770399812953004


### <center>GridSearchCV on RandomForestClassifier

In [5]:
rf_grid_params = {
    'n_estimators': np.arange(400, 701, 100),
    'max_depth': list(range(15, 26, 5)) + [None],
    'max_features': range(3, 6),
    'criterion': ['gini', 'entropy'],
    'min_samples_leaf': np.logspace(3, 4, num=2, base=2, dtype=int), # [8, 16]
    'min_samples_split': [10, 18],
    'class_weight': [{0: 1, 1: w} for w in [0.1, 0.3]]
}

gs_rf_start_time = time.time()
gs = GridSearchCV(
    estimator=rf_clf, 
    param_grid=rf_grid_params, 
    cv=3, 
    scoring='accuracy', 
    n_jobs=-1, 
    verbose=True
)
gs.fit(x_train, y_train)
gs_rf_end_time = time.time()
gs_rf_seconds = gs_rf_end_time - gs_rf_start_time

print("Best hyperparameters:")
for key, value in gs.best_params_.items():
    print(f"{key} = {value}")
print(f"\nBest Accuracy value: {gs.best_score_}")
print(f"Search took {gs_rf_seconds // 60:.0f} minutes and {gs_rf_seconds % 60:.0f} seconds")

Fitting 3 folds for each of 768 candidates, totalling 2304 fits
Best hyperparameters:
class_weight = {0: 1, 1: 0.3}
criterion = gini
max_depth = 15
max_features = 5
min_samples_leaf = 8
min_samples_split = 18
n_estimators = 400

Best Accuracy value: 0.7441673358738989
Search took 12 minutes and 23 seconds


In [6]:
rf_clf_gs = gs.best_estimator_

rf_clf_gs.fit(x_train, y_train)

rf_clf_gs_pred = rf_clf_gs.predict(x_test)
print(f"Accuracy on test: {accuracy_score(y_test, rf_clf_gs_pred)}")

Accuracy on test: 0.7142857142857143


### <center>Bayesian Optimization on RandomForestClassifier

In [7]:
def rf_objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 400, 700, step=100)
    max_depth = trial.suggest_categorical('max_depth', [None] + list(range(15, 36, 5)))
    max_features = trial.suggest_int('max_features', 3, 5)
    criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 8, 20)
    min_samples_split = trial.suggest_int('min_samples_split', 8, 20)
    class_weight = trial.suggest_categorical('class_weight', [{0: 1, 1: w} for w in np.linspace(0.1, 0.3, 5)])

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        max_features=max_features,
        criterion=criterion,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        class_weight=class_weight,
        n_jobs=-1,
        random_state=42
    )

    model.fit(x_train, y_train)
    model_pred = model.predict(x_train)
    return accuracy_score(y_train, model_pred)


n_trials = 50
with tqdm(total=n_trials, desc="Optimizing", unit="trial") as pbar:
    def tqdm_callback(study, trial):
        pbar.update(1)

    rf_study = optuna.create_study(direction='maximize')
    bayes_rf_start_time = time.time()
    rf_study.optimize(rf_objective, n_trials=n_trials, callbacks=[tqdm_callback])
    bayes_rf_end_time = time.time()
    bayes_rf_seconds = bayes_rf_end_time - bayes_rf_start_time
    pbar.close()
    

print("Bayesian Optimization Best Parameters:")
for key, value in rf_study.best_params.items():
    print(f"{key}: {value}")
print(f"Bayesian Optimization Best Score: {rf_study.best_value}")
print(f"Search took {bayes_rf_seconds // 60:.0f} minutes and {bayes_rf_seconds % 60:.0f} seconds")

Optimizing: 100%|███████████████████████████████████████████████████████████████████| 50/50 [01:12<00:00,  1.45s/trial]

Bayesian Optimization Best Parameters:
n_estimators: 700
max_depth: None
max_features: 4
criterion: gini
min_samples_leaf: 8
min_samples_split: 14
class_weight: {0: 1, 1: 0.3}
Bayesian Optimization Best Score: 0.7728057057402508
Search took 1 minutes and 12 seconds





In [8]:
rf_clf_bayes = RandomForestClassifier(
    **rf_study.best_params,
    random_state=42
)

rf_clf_bayes.fit(x_train, y_train)
rf_clf_bayes_pred = rf_clf_bayes.predict(x_test)
print(f"Accuracy on test: {accuracy_score(y_test, rf_clf_bayes_pred)}")

Accuracy on test: 0.7278466214636428


### <center>RandomizedSearchCV on XGBoostClassifier

In [9]:
xgb_clf = XGBClassifier(eval_metric='logloss', random_state=42)

xgb_random_params = {
    'n_estimators': np.arange(400, 701, 100),
    'max_depth': list(range(3, 16, 2)),
    'learning_rate': np.linspace(0.01, 0.1, 10),
    'subsample': [1],
    'colsample_bytree': [1],
    'gamma': np.linspace(0, 1.2, 10),
    'min_child_weight': np.arange(1, 10, 1),
    'scale_pos_weight': np.linspace(0.2, 0.5, 10),
    'reg_alpha': np.linspace(3, 8, 5),
    'reg_lambda': np.linspace(3, 8, 5)
}

rs_xgb_start_time = time.time()
rs = RandomizedSearchCV(
    estimator=xgb_clf,
    param_distributions=xgb_random_params,
    n_iter=1536,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=True,
    random_state=42
)
rs.fit(x_train, y_train)
rs_xgb_end_time = time.time()
rs_xgb_seconds = rs_xgb_end_time - rs_xgb_start_time


print("Best hyperparameters:")
for key, value in rs.best_params_.items():
    print(f"{key} = {value}")
print(f"\nBest Accuracy value: {rs.best_score_}")
print(f"Search took {rs_xgb_seconds // 60:.0f} minutes and {rs_xgb_seconds % 60:.0f} seconds")

Fitting 3 folds for each of 1536 candidates, totalling 4608 fits
Best hyperparameters:
subsample = 1
scale_pos_weight = 0.5
reg_lambda = 3.0
reg_alpha = 3.0
n_estimators = 700
min_child_weight = 4
max_depth = 11
learning_rate = 0.07
gamma = 1.0666666666666667
colsample_bytree = 1

Best Accuracy value: 0.7379578630122672
Search took 5 minutes and 35 seconds


In [10]:
xgb_clf_rs = rs.best_estimator_
xgb_clf_rs.fit(x_train, y_train)
xgb_clf_rs_pred = xgb_clf_rs.predict(x_test)
print(f"Accuracy on test: {accuracy_score(y_test, xgb_clf_rs_pred)}")

Accuracy on test: 0.6427402384849193


### <center>GridSearchCV on XGBoostClassifier

In [11]:
xgb_grid_params = {
    'n_estimattors': np.arange(400, 701, 100),
    'max_depth': np.arange(7, 16, 3),
    'learning_rate': np.arange(0.1, 0.4, 0.2),
    'subsample': [1],
    'colsample_bytree': [1],
    'gamma': np.arange(1, 1.3, 0.1),
    'min_child_weight': np.arange(5, 9, 2),
    'scale_pos_weight': np.arange(0.2, 0.6, 0.2),
    'reg_alpha': np.arange(6, 8, 1),
    'reg_lambda': np.arange(5, 7, 1)
}
gs_xgb_start_time = time.time()
gs = GridSearchCV(
    estimator=xgb_clf,
    param_grid=xgb_grid_params,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=True
)
gs.fit(x_train, y_train)
gs_xgb_end_time = time.time()
gs_xgb_seconds = gs_xgb_end_time - gs_xgb_start_time

print("Best hyperparameters:")
for key, value in gs.best_params_.items():
    print(f"{key} = {value}")
print(f"\nBest Accuracy value: {gs.best_score_}")
print(f"Search took {gs_xgb_seconds // 60:.0f} minutes and {gs_xgb_seconds % 60:.0f} seconds")

Fitting 3 folds for each of 1536 candidates, totalling 4608 fits
Best hyperparameters:
colsample_bytree = 1
gamma = 1.2000000000000002
learning_rate = 0.30000000000000004
max_depth = 10
min_child_weight = 5
n_estimattors = 400
reg_alpha = 6
reg_lambda = 5
scale_pos_weight = 0.4
subsample = 1

Best Accuracy value: 0.6978145212450566
Search took 1 minutes and 23 seconds


In [12]:
xgb_clf_gs = gs.best_estimator_
xgb_clf_gs.fit(x_train, y_train)
xgb_clf_gs_pred = xgb_clf_gs.predict(x_test)
print(f"Accuracy on test: {accuracy_score(y_test, xgb_clf_gs_pred)}")

Accuracy on test: 0.6750058452186112


### <center>Bayesian Optimization on XGBoostClassifier

In [13]:
def xgb_objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 400, 700, step=100)
    max_depth = trial.suggest_int('max_depth', 10, 20, step=1)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.5, step=0.01)
    subsample = trial.suggest_float('subsample', 0.9, 1.0, step=0.1)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.9, 1.0, step=0.1)
    gamma = trial.suggest_float('gamma', 1.2, 1.9, step=0.1)
    min_child_weight = trial.suggest_int('min_child_weight', 5, 10)
    scale_pos_weight = trial.suggest_float('scale_pos_weight', 0.15, 0.22, step=0.05)
    reg_alpha = trial.suggest_float('reg_alpha', 7, 15, step=0.2)
    reg_lambda = trial.suggest_float('reg_lambda', 5, 8, step=0.5)

    model = XGBClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        gamma=gamma,
        min_child_weight=min_child_weight,
        scale_pos_weight=scale_pos_weight,
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
        use_label_encoder=False,
        eval_metric='logloss',
        n_jobs=-1,
        random_state=42
    )

    model.fit(x_train, y_train)
    model_pred = model.predict(x_train)
    return accuracy_score(y_train, model_pred)


n_trials = 50
with tqdm(total=n_trials, desc="Optimizing", unit="trial") as pbar:
    def tqdm_callback(study, trial):
        pbar.update(1)

    xgb_study = optuna.create_study(direction='maximize')
    bayes_xgb_start_time = time.time()
    xgb_study.optimize(xgb_objective, n_trials=n_trials, callbacks=[tqdm_callback])
    bayes_xgb_end_time = time.time()
    bayes_xgb_seconds = bayes_xgb_end_time - bayes_xgb_start_time
    pbar.close()


print("Bayesian Optimization Best Parameters:")
for key, value in xgb_study.best_params.items():
    print(f"{key}: {value}")
print(f"Bayesian Optimization Best Score: {xgb_study.best_value}")
print(f"Search took {bayes_xgb_seconds // 60:.0f} minutes and {bayes_xgb_seconds % 60:.0f} seconds")

Optimizing: 100%|███████████████████████████████████████████████████████████████████| 50/50 [00:20<00:00,  2.50trial/s]

Bayesian Optimization Best Parameters:
n_estimators: 700
max_depth: 11
learning_rate: 0.5
subsample: 1.0
colsample_bytree: 1.0
gamma: 1.3
min_child_weight: 6
scale_pos_weight: 0.2
reg_alpha: 7.4
reg_lambda: 5.5
Bayesian Optimization Best Score: 0.7524444955711492
Search took 0 minutes and 20 seconds





In [14]:
xgb_clf_bayes = XGBClassifier(
    **xgb_study.best_params,
    random_state=42
)

xgb_clf_bayes.fit(x_train, y_train)
xgb_clf_bayes_pred = xgb_clf_bayes.predict(x_test)
print(f"Accuracy on test: {accuracy_score(y_test, xgb_clf_bayes_pred)}")

Accuracy on test: 0.729249473930325


In [15]:
print(f"Time spent on RandomizedSearchCV for RandomForestClassifier: {rs_rf_seconds} seconds")
print(f"Time spent on GridSearchCV for RandomForestClassifier: {gs_rf_seconds} seconds")
print(f"Time spent on Bayesian Optimization for RandomForestClassifier: {bayes_rf_seconds} seconds")
print(f"Time spent on RandomizedSearchCV for XGBoostClassifier: {rs_xgb_seconds} seconds")
print(f"Time spent on GridSearchCV for XGBoostClassifier: {gs_xgb_seconds} seconds")
print(f"Time spent on Bayesian Optimization for XGBoostClassifier: {bayes_xgb_seconds} seconds")

Time spent on RandomizedSearchCV for RandomForestClassifier: 767.6952238082886 seconds
Time spent on GridSearchCV for RandomForestClassifier: 743.2279887199402 seconds
Time spent on Bayesian Optimization for RandomForestClassifier: 72.44222092628479 seconds
Time spent on RandomizedSearchCV for XGBoostClassifier: 335.03100514411926 seconds
Time spent on GridSearchCV for XGBoostClassifier: 83.3002302646637 seconds
Time spent on Bayesian Optimization for XGBoostClassifier: 20.032005310058594 seconds
