In [17]:
pip install optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting optuna
  Downloading optuna-3.1.0-py3-none-any.whl (365 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.3/365.3 KB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cmaes>=0.9.1
  Downloading cmaes-0.9.1-py3-none-any.whl (21 kB)
Collecting alembic>=1.5.0
  Downloading alembic-1.10.2-py3-none-any.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 KB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 KB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, cmaes, alembic, optuna
Successfully installed Mako-1.2.4 alembic-1.10.2 cmaes-0.9.1 colorlog-6.7.0 optuna-3.1

In [18]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn import datasets
# Hyperopt
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
# Optuna
import optuna
# Grid Search 
from sklearn.model_selection import GridSearchCV
# Random Search
from scipy.stats import randint as sp_randint
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats as stats
from scipy.special import factorial

In [2]:
# Loading Boston housing dataset's features and target values
data_raw_url = 'https://raw.githubusercontent.com/CallmeQuant/Misc-Dataset/main/Boston%20Housing%20Dataset/housing.csv'
column_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'Price']
data = pd.read_csv(data_raw_url, header=None, delimiter=r"\s+", names=column_names)

# Splitting into features and target arrays
X = data.loc[:, ~data.columns.isin(['Price'])].values
y = data.loc[:, 'Price'].values

In [3]:
print(X.shape)
print(y.shape)
data.head(10)

(506, 13)
(506,)


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,Price
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2
5,0.02985,0.0,2.18,0,0.458,6.43,58.7,6.0622,3,222.0,18.7,394.12,5.21,28.7
6,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311.0,15.2,395.6,12.43,22.9
7,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311.0,15.2,396.9,19.15,27.1
8,0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,5,311.0,15.2,386.63,29.93,16.5
9,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311.0,15.2,386.71,17.1,18.9


## **Random Forest Regressor with Default Hyperparameters**

In [9]:
#Random Forest
%%time
clf = RandomForestRegressor()
scores = cross_val_score(clf, X, y, cv=3,scoring='neg_mean_squared_error') # 3-fold cross-validation
print("MSE:"+ str(-scores.mean()))

MSE:29.700032868648446
CPU times: user 620 ms, sys: 2.11 ms, total: 622 ms
Wall time: 626 ms


## **Hyperparameters Tuning Approach 1: Grid Search Cross-Validation**


In [10]:
# Define hyperparameter configuration space 
%%time
rf_params_space = {
    'n_estimators': [10, 20, 30],
    'max_features': ['sqrt', 0.5],
    'max_depth': [15, 20, 30, 50],
    'min_samples_leaf': [1,2,4,8],
    "bootstrap":[True,False],
    "criterion":['squared_error', 'absolute_error']
}

clf = RandomForestRegressor(random_state=0)
grid = GridSearchCV(clf, rf_params_space, cv = 3, scoring='neg_mean_squared_error')
grid.fit(X, y)
print(grid.best_params_)
print("MSE:"+ str(-grid.best_score_))

{'bootstrap': True, 'criterion': 'squared_error', 'max_depth': 15, 'max_features': 0.5, 'min_samples_leaf': 4, 'n_estimators': 20}
MSE:25.639921597582575
CPU times: user 15.4 s, sys: 68.6 ms, total: 15.5 s
Wall time: 16.8 s


## **Hyperparameters Tuning Approach 1: Random Search Cross-Validation**

In [11]:
# Define hyperparameter configuration space 
%%time
rf_params_space = {
    'n_estimators': sp_randint(10,100),
    "max_features":sp_randint(1,13),
    'max_depth': sp_randint(5,50),
    "min_samples_split":sp_randint(2,11),
    "min_samples_leaf":sp_randint(1,11),
    "criterion":['squared_error', 'absolute_error']
}
n_iter_search=20 # number of iterations is set to 20
clf = RandomForestRegressor(random_state=0)
Random = RandomizedSearchCV(clf, param_distributions=rf_params_space, error_score = 'raise',
                            n_iter=n_iter_search, cv=3, scoring="neg_mean_squared_error")
Random.fit(X, y)
print(Random.best_params_)
print("MSE:"+ str(-Random.best_score_))

{'criterion': 'squared_error', 'max_depth': 21, 'max_features': 6, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 13}
MSE:25.651535014283024
CPU times: user 5.09 s, sys: 21.3 ms, total: 5.11 s
Wall time: 5.14 s


## **Bayesian Optimization with Tree-structured Parzen Estimator (TPE)**
### **Hyperopt**

In [16]:
#Random Forest
# Define the objective function
%%time
def objective(params):
    params = {
        'n_estimators': int(params['n_estimators']), 
        'max_depth': int(params['max_depth']),
        'max_features': int(params['max_features']),
        "min_samples_split":int(params['min_samples_split']),
        "min_samples_leaf":int(params['min_samples_leaf']),
        "criterion":str(params['criterion'])
    }
    clf = RandomForestRegressor( **params)
    score = -np.mean(cross_val_score(clf, X, y, cv=3, n_jobs=-1,
                                    scoring="neg_mean_squared_error"))

    return {'loss':score, 'status': STATUS_OK }
# Define the hyperparameter configuration space
space = {
    'n_estimators': hp.quniform('n_estimators', 10, 100, 1),
    'max_depth': hp.quniform('max_depth', 5, 50, 1),
    "max_features":hp.quniform('max_features', 1, 13, 1),
    "min_samples_split":hp.quniform('min_samples_split',2,11,1),
    "min_samples_leaf":hp.quniform('min_samples_leaf',1,11,1),
    "criterion":hp.choice('criterion',['squared_error', 'absolute_error'])
}

best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=20)
print("Random Forest: Hyperopt estimated optimum {}".format(best))

100%|██████████| 20/20 [00:08<00:00,  2.39trial/s, best loss: 25.625549516346542]
Random Forest: Hyperopt estimated optimum {'criterion': 0, 'max_depth': 22.0, 'max_features': 8.0, 'min_samples_leaf': 3.0, 'min_samples_split': 4.0, 'n_estimators': 95.0}
CPU times: user 221 ms, sys: 26.1 ms, total: 247 ms
Wall time: 8.37 s


### **Optuna**

In [20]:
# Define the objective function
%%time
def objective(trial:optuna.trial.Trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 10, 100),
            'max_depth': trial.suggest_int('max_depth', 5, 50),
            'max_features': trial.suggest_int('max_features', 1, 13),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 11),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 11),
        }
        clf = RandomForestRegressor(**params)
        score = -np.mean(cross_val_score(clf, X, y, cv=3, n_jobs=-1,
                                    scoring="neg_mean_squared_error"))
        return score
    
study = optuna.create_study(direction='minimize', pruner=optuna.pruners.MedianPruner(n_warmup_steps=5))
study.optimize(objective, n_trials = 20)
print(f'===== Optimizing Process Finished =====')
print(study.best_value)
print(study.best_params)

[32m[I 2023-03-20 18:55:40,110][0m A new study created in memory with name: no-name-f6b8e694-eef8-4142-8086-7f3a354c20cb[0m
[32m[I 2023-03-20 18:55:41,764][0m Trial 0 finished with value: 28.252345369370246 and parameters: {'n_estimators': 100, 'max_depth': 35, 'max_features': 4, 'min_samples_split': 7, 'min_samples_leaf': 2}. Best is trial 0 with value: 28.252345369370246.[0m
[32m[I 2023-03-20 18:55:41,996][0m Trial 1 finished with value: 30.71170174637524 and parameters: {'n_estimators': 79, 'max_depth': 31, 'max_features': 4, 'min_samples_split': 2, 'min_samples_leaf': 11}. Best is trial 0 with value: 28.252345369370246.[0m
[32m[I 2023-03-20 18:55:42,156][0m Trial 2 finished with value: 32.91101007512376 and parameters: {'n_estimators': 57, 'max_depth': 10, 'max_features': 3, 'min_samples_split': 2, 'min_samples_leaf': 10}. Best is trial 0 with value: 28.252345369370246.[0m
[32m[I 2023-03-20 18:55:42,317][0m Trial 3 finished with value: 28.86962236285329 and parameters

===== Optimizing Process Finished =====
26.832964941612005
{'n_estimators': 89, 'max_depth': 45, 'max_features': 6, 'min_samples_split': 10, 'min_samples_leaf': 4}
CPU times: user 430 ms, sys: 36.1 ms, total: 466 ms
Wall time: 7.12 s
