In [None]:
# ONE OFF INSTALL
# %pip install scikit_optimize

In [None]:
# Libraries

import pandas as pd

import matplotlib.pyplot as plt
import numpy as np

from sklearn.pipeline import Pipeline #sklearn==0.23.2
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from lightgbm import LGBMClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import loguniform, randint
from skopt import BayesSearchCV

#### Data import and pre-processing

In [None]:
# Example data: Kaggle heart-attack dataset
# https://www.kaggle.com/datasets/rashikrahmanpritom/heart-attack-analysis-prediction-dataset

# Column groups consumed by the preprocessing pipeline further down.
numeric_var = ['age', 'trtbps', 'chol', 'thalachh', 'oldpeak']
categorical_var = ['sex', 'cp', 'fbs', 'restecg', 'exng', 'slp', 'caa', 'thall', 'age_bins']

df_heart = pd.read_csv('heart.csv')

# Derive a three-level age band from 'age'; include_lowest=True makes the first
# bin closed on the left so the lowest edge (25) is itself included.
lables = ['<45', '45-60', '60+']
df_heart['age_bins'] = pd.cut(
    x=df_heart['age'],
    bins=[25, 45, 60, 100],
    labels=lables,
    include_lowest=True,
)

# Cast every categorical column (including the new 'age_bins') to category dtype.
df_heart = df_heart.astype({column: 'category' for column in categorical_var})

# Features are all columns except the target 'output'.
X = df_heart.drop(columns='output')
y = df_heart['output']

In [None]:
# Build the modelling pipeline: scale the numeric columns, one-hot encode the
# categorical columns, then pass the result to a LightGBM classifier.

tuples = [
    (Pipeline([
        ('scaler', StandardScaler()),
    ]), numeric_var),
    # handle_unknown='ignore': the encoder learns its categories from the data it
    # is fitted on, so during cross-validation a rare level may be missing from a
    # training fold. With the default ('error') transforming the held-out fold
    # would raise; with 'ignore' the unseen level is encoded as all zeros.
    (Pipeline([
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]), categorical_var),
]

# Each (transformer, columns) pair becomes one branch of the column transformer.
preprocess = make_column_transformer(*tuples)

# Step names matter: the searches below address the model's hyperparameters
# through the 'classifier__' prefix.
pipe = Pipeline([
    ('preprocess', preprocess),
    ('classifier', LGBMClassifier()),
])

#### Compare Search Optimization

NB: because each optimization technique below fits the model many times, each search cell may take up to 10 minutes to run.

In [None]:
# Grid search first: every combination of the values listed below is evaluated
# (5 x 5 x 5 = 125 candidates).

param_grid = dict(
    classifier__learning_rate=[0.0001, 0.0005, 0.001, 0.01, 0.1],
    classifier__n_estimators=[100, 300, 600, 800, 1000],
    classifier__max_depth=[4, 20, 100, 250, 400],
)

# 5-fold cross-validation, scored by ROC AUC, using 8 parallel jobs.
reg_grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='roc_auc',
    n_jobs=8,
    cv=5,
)

# Keep the result of fit(); its cv_results_ is read by the plotting cells.
model_grid = reg_grid.fit(X, y)

In [None]:
# Random search: n_iter candidates are drawn at random from the distributions below.

n_iter = 70

# scipy.stats.randint(low, high) samples from [low, high) - the upper bound is
# exclusive. The bounds are therefore written as max + 1 so that 1000 estimators
# and a depth of 400 remain reachable, matching the ranges explored by the grid
# search and the Bayesian search.
param_grid = {
    "classifier__learning_rate": loguniform(1e-4, 0.1),
    "classifier__n_estimators": randint(100, 1001),
    "classifier__max_depth": randint(4, 401),
}

# Same CV scheme, metric and parallelism as the other searches; random_state
# makes the sampled candidates reproducible.
reg_rand = RandomizedSearchCV(pipe,
                              param_distributions=param_grid,
                              n_iter=n_iter,
                              cv=5,
                              n_jobs=8,
                              scoring='roc_auc',
                              random_state=123)

# Keep the result of fit(); its cv_results_ is read by the plotting cells.
model_rand = reg_rand.fit(X, y)

In [None]:
# Bayesian search (scikit-optimize): n_iter candidates are evaluated.

n_iter = 70

# Search space given as (low, high) tuples; the learning rate additionally
# requests a "log-uniform" prior via the third tuple element.
param_grid = dict(
    classifier__learning_rate=(0.0001, 0.1, "log-uniform"),
    classifier__n_estimators=(100, 1000),
    classifier__max_depth=(4, 400),
)

# 5-fold cross-validation, scored by ROC AUC, using 8 parallel jobs, with a
# fixed random_state.
reg_bay = BayesSearchCV(
    estimator=pipe,
    search_spaces=param_grid,
    scoring='roc_auc',
    random_state=123,
    n_iter=n_iter,
    n_jobs=8,
    cv=5,
)

# Keep the result of fit(); its cv_results_ is read by the plotting cells.
model_bay = reg_bay.fit(X, y)

In [None]:
#### Performance Visualisation

In [None]:
# Visualise which learning rate each search evaluated, in the order the
# candidates are stored in cv_results_.

param = 'param_classifier__learning_rate'

grid = model_grid.cv_results_[param]
rand = model_rand.cv_results_[param]
bay = model_bay.cv_results_[param]

fig, ax = plt.subplots(figsize=(15, 7))

# np.asarray(..., dtype=float) treats the three results uniformly, whether the
# entry is a masked array or a plain sequence.
ax.scatter(np.arange(len(grid)), np.asarray(grid, dtype=float), c='b', s=20, label='grid')
ax.scatter(np.arange(len(rand)), np.asarray(rand, dtype=float), c='r', s=20, label='random')
ax.scatter(np.arange(len(bay)), np.asarray(bay, dtype=float), c='g', s=20, label='bayesian')

# The learning rates span several orders of magnitude, hence the log axis.
ax.set_yscale('log')
ax.set_xlabel('candidate index')
ax.set_ylabel('learning rate (log scale)')
ax.set_title(param)
ax.legend();

In [None]:
# Exercise - plot the mean test score for each method instead

In [None]:
# Mean cross-validated test score (ROC AUC) of every candidate, per search method.
param = 'mean test score'

# Read the scores directly from each fitted search so this cell does not rely
# on the learning-rate arrays created by the previous plotting cell.
scores_grid = model_grid.cv_results_["mean_test_score"]
scores_rand = model_rand.cv_results_["mean_test_score"]
scores_bay = model_bay.cv_results_["mean_test_score"]

fig, ax = plt.subplots(figsize=(15, 7))

ax.scatter(np.arange(len(scores_grid)), scores_grid, c='b', s=20, label='grid')
ax.scatter(np.arange(len(scores_rand)), scores_rand, c='r', s=20, label='random')
ax.scatter(np.arange(len(scores_bay)), scores_bay, c='g', s=20, label='bayesian')

ax.set_xlabel('candidate index')
ax.set_ylabel('mean test score (ROC AUC)')
ax.set_title(param)
ax.legend();

Bayesian search tends to achieve higher performance (ROC AUC in this case).

In [None]:
# Exercise (stretch) - import a dataset from Kaggle, e.g.
# IoT https://www.kaggle.com/code/helloedi/ml-telemetrysensordata
# or a retail dataset, and perform the same search-optimization comparison (grid vs. random vs. Bayesian) on it

Reference: https://towardsdatascience.com/bayesian-optimization-for-hyperparameter-tuning-how-and-why-655b0ee0b399