In [None]:
!pip install bayesian-optimization

Collecting bayesian-optimization
  Downloading bayesian_optimization-1.4.3-py3-none-any.whl (18 kB)
Collecting colorama>=0.4.6 (from bayesian-optimization)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: colorama, bayesian-optimization
Successfully installed bayesian-optimization-1.4.3 colorama-0.4.6


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from bayes_opt import BayesianOptimization
from sklearn.metrics import r2_score, accuracy_score

In [None]:
# Load the dataset from a CSV file expected in the working directory.
data = pd.read_csv('powerplant_energy_data.csv')

# List every column next to its positional index so the target can be chosen.
print("Column Names with Indexes:")
for idx, col_name in enumerate(data.columns):
    print(f"Index {idx}: {col_name}")
# Interactive prompt: the notebook blocks here until an index is typed, and
# int() raises ValueError if the answer is not an integer.
target_col_idx = int(input("Enter the index of the target variable column: "))


# Features are every column except the chosen one; the target is that column.
X = data.drop(data.columns[target_col_idx], axis=1)
y = data.iloc[:, target_col_idx]
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preview the first rows of the features and of the target.
print("\nX (features):")
print(X.head())
print("\ny (target variable):")
print(y.head())

Column Names with Indexes:
Index 0: ambient_temparature
Index 1: exhaust_vaccum
Index 2: ambient_pressure
Index 3: relative_humidity
Index 4: energy_output
Enter the index of the target variable column: 4

X (features):
   ambient_temparature  exhaust_vaccum  ambient_pressure  relative_humidity
0                14.96           41.76           1024.07              73.17
1                25.18           62.96           1020.04              59.08
2                 5.11           39.40           1012.16              92.14
3                20.86           57.32           1010.24              76.64
4                10.82           37.50           1009.23              96.62

y (target variable):
0    463.26
1    444.37
2    488.56
3    446.48
4    473.90
Name: energy_output, dtype: float64


In [None]:
#@title Data Preprocessing


In [None]:
#@title Model Selection

def algorithm_type(x_var, y_var):
    """Infer whether the target variable calls for regression or classification.

    Parameters
    ----------
    x_var : array-like
        Feature matrix. Only used when the target is ambiguous and the
        empirical random-forest comparison below is run.
    y_var : array-like with a ``dtype`` attribute
        Target values (for example a pandas Series or a NumPy array).

    Returns
    -------
    str
        ``'regression'`` or ``'classification'``. The label is always lower
        case because ``model_selection`` compares the problem type against the
        lower-case spellings.

    Raises
    ------
    ValueError
        If the problem type has to be entered manually and the answer is not
        one of r / regression / c / classification.
    """
    from sklearn.utils.multiclass import type_of_target
    dtype = y_var.dtype
    target_type = type_of_target(y_var)

    if dtype == 'object' or target_type == 'binary':
        problem_type = 'classification'
        print('Object or Binary target variable detected !')

    elif target_type == 'continuous':
        problem_type = 'regression'
        print('Continuous target variable detected !')

    elif dtype in ['int64', 'float64'] or target_type in ['multiclass']:
        # Ambiguous numeric target: fit one forest of each kind and keep the
        # problem type whose forest scores higher on a held-out part.
        # The split is taken from the arguments, not from notebook globals,
        # so the function works for whatever data it is handed.
        x_fit, x_eval, y_fit, y_eval = train_test_split(
            x_var, y_var, test_size=0.23, random_state=42)
        # Seeded so the decision does not change from one run to the next.
        rf_classifier = RandomForestClassifier(random_state=42)
        rf_regressor = RandomForestRegressor(random_state=42)
        rf_classifier.fit(x_fit, y_fit)
        rf_regressor.fit(x_fit, y_fit)
        classifier_score = rf_classifier.score(x_eval, y_eval)
        regressor_score = rf_regressor.score(x_eval, y_eval)
        # NOTE(review): this compares accuracy with R^2, which are different
        # scales; kept as the original heuristic.
        if classifier_score > regressor_score:
            problem_type = 'classification'
        else:
            problem_type = 'regression'
        print(f'CS:{classifier_score} , RS:{regressor_score}')

    else:
        answer = input('''Specify problem type manually -
        (r for Regression / c for Classification) : ''').strip().lower()
        # Map the shorthand onto the canonical labels used everywhere else.
        if answer in ('r', 'regression'):
            problem_type = 'regression'
        elif answer in ('c', 'classification'):
            problem_type = 'classification'
        else:
            raise ValueError('Invalid problem type specified.')

    return problem_type


def model_analysis(ptype):
    """Return the candidate estimators for the given problem type.

    Parameters
    ----------
    ptype : str
        Problem type; matched case-insensitively against
        regression / r and classification / c.

    Returns
    -------
    list of (str, estimator)
        Three (name, freshly constructed estimator) pairs: a decision tree,
        a random forest and a gradient-boosting model of the matching kind.

    Raises
    ------
    ValueError
        If ``ptype`` is none of the accepted spellings.
    """
    kind = ptype.lower()

    if kind in ('regression', 'r'):
        return [
            ('DecisionTreeRegressor', DecisionTreeRegressor()),
            ('RandomForestRegressor', RandomForestRegressor()),
            ('GradientBoostingRegressor', GradientBoostingRegressor()),
        ]

    if kind in ('classification', 'c'):
        return [
            ('DecisionTreeClassifier', DecisionTreeClassifier()),
            ('RandomForestClassifier', RandomForestClassifier()),
            ('GradientBoostingClassifier', GradientBoostingClassifier()),
        ]

    raise ValueError('Invalid problem type specified.')


def model_selection(models, X, y, problem_type, cv=3):
    """Pick the candidate with the best mean cross-validation score.

    Parameters
    ----------
    models : iterable of (str, estimator)
        Candidates, e.g. the list returned by ``model_analysis``.
    X, y : array-like
        Features and target passed to ``cross_val_score``.
    problem_type : str
        Matched case-insensitively: regression / r selects R^2 scoring,
        classification / c selects accuracy. The same spellings are accepted
        by ``model_analysis``.
    cv : int, default 3
        Number of cross-validation folds.

    Returns
    -------
    estimator or None
        The (unfitted) estimator object with the highest mean score, or None
        when ``models`` is empty or no candidate produced a comparable score.

    Raises
    ------
    ValueError
        If ``problem_type`` is not a recognised spelling. Previously such a
        value silently scored every model with 'accuracy' and returned None.
    """
    kind = str(problem_type).strip().lower()
    if kind in ('regression', 'r'):
        scoring = 'r2'
    elif kind in ('classification', 'c'):
        scoring = 'accuracy'
    else:
        raise ValueError('Invalid problem type specified.')

    best_model = None
    # Start below every possible score so the first valid candidate always
    # wins, whichever metric is in use.
    best_score = float('-inf')

    for _name, model in models:
        mean_score = cross_val_score(model, X, y, scoring=scoring, cv=cv).mean()
        # A NaN mean (failed folds) compares False here and is skipped.
        if mean_score > best_score:
            best_score = mean_score
            best_model = model
    return best_model


# Infer the problem type from the target column.
algorithm = algorithm_type(X, y)
print(f'Algorithm selected : {algorithm}')

# Build the candidate estimators for that problem type.
models = model_analysis(algorithm)
print(models)

# Choose one candidate by cross-validation.
# NOTE(review): X and y are the full dataset, so the rows held out as
# X_test / y_test also take part in this selection - confirm that is intended.
best_model = model_selection(models, X, y, algorithm)
print(f'Model selected : {best_model}')

Continuous target variable detected !
Algorithm selected : regression
[('DecisionTreeRegressor', DecisionTreeRegressor()), ('RandomForestRegressor', RandomForestRegressor()), ('GradientBoostingRegressor', GradientBoostingRegressor())]
Model selected : RandomForestRegressor()


In [None]:
#@title HPO - Bayesian Optimization - Regression
################################################

# Objective functions for the regression searches. Each one fits on
# X_train / y_train and returns R^2 on X_test / y_test, which is the value
# the optimiser maximises.
# They are named *_objective so that running this cell never rebinds
# optimize_dtr / optimize_rfr / optimize_gbr: the "Optimizer Functions" cell
# defines functions with those names and a different signature, and
# optimize_model() calls them.
# Estimators are seeded so evaluating the same point twice gives the same score.


# Decision Tree Regressor
def dtr_objective(max_depth, min_samples_split, min_samples_leaf):
    """R^2 on the test split for a decision tree with the proposed settings."""
    # The optimiser proposes floats; these settings must be integers.
    dtr = DecisionTreeRegressor(max_depth=int(max_depth), min_samples_split=int(min_samples_split),
                                min_samples_leaf=int(min_samples_leaf), random_state=42)
    dtr.fit(X_train, y_train)
    y_pred = dtr.predict(X_test)
    return r2_score(y_test, y_pred)

dtr_bounds = {'max_depth': (1, 50), 'min_samples_split': (2, 20),
              'min_samples_leaf': (1, 20)}


# Random Forest Regressor
def rfr_objective(n_estimators, max_depth, min_samples_split, min_samples_leaf):
    """R^2 on the test split for a random forest with the proposed settings."""
    rfr = RandomForestRegressor(n_estimators=int(n_estimators), max_depth=int(max_depth),
                                min_samples_split=int(min_samples_split), min_samples_leaf=int(min_samples_leaf),
                                random_state=42)
    rfr.fit(X_train, y_train)
    y_pred = rfr.predict(X_test)
    return r2_score(y_test, y_pred)

rfr_bounds = {'n_estimators': (10, 100), 'max_depth': (1, 50), 'min_samples_split': (2, 20),
              'min_samples_leaf': (1, 20)}


# Gradient Boosting Regressor
def gbr_objective(n_estimators, learning_rate, max_depth, min_samples_split, min_samples_leaf, subsample):
    """R^2 on the test split for gradient boosting with the proposed settings."""
    gbr = GradientBoostingRegressor(n_estimators=int(n_estimators), learning_rate=learning_rate,
                                    max_depth=int(max_depth), min_samples_split=int(min_samples_split),
                                    min_samples_leaf=int(min_samples_leaf), subsample=subsample,
                                    random_state=42)
    gbr.fit(X_train, y_train)
    y_pred = gbr.predict(X_test)
    return r2_score(y_test, y_pred)

gbr_bounds = {'n_estimators': (10, 100), 'learning_rate': (0.001, 1.0), 'max_depth': (1, 50),
              'min_samples_split': (2, 20), 'min_samples_leaf': (1, 20), 'subsample': (0.1, 1.0)}


# Only run the regression searches when the detected problem type is
# regression, so "Run All" does not try to fit regressors to class labels.
if str(algorithm).lower() in ('regression', 'r'):
    bayes_dtr = BayesianOptimization(f=dtr_objective, pbounds=dtr_bounds, random_state=42)
    bayes_dtr.maximize(init_points=10, n_iter=10)

    print("Decision Tree Regressor: Best parameters -", bayes_dtr.max)
    print("R-squared Score:", bayes_dtr.max['target'])

    bayes_rfr = BayesianOptimization(f=rfr_objective, pbounds=rfr_bounds, random_state=42)
    bayes_rfr.maximize(init_points=10, n_iter=10)

    print("Random Forest Regressor: Best parameters -", bayes_rfr.max)
    print("R-squared Score:", bayes_rfr.max['target'])

    bayes_gbr = BayesianOptimization(f=gbr_objective, pbounds=gbr_bounds, random_state=42)
    bayes_gbr.maximize(init_points=10, n_iter=10)

    print("GBM Regressor: Best parameters -", bayes_gbr.max)
    print("R-squared Score:", bayes_gbr.max['target'])
else:
    print(f"Skipping regression HPO: problem type is {algorithm!r}")


In [None]:
#@title HPO - Bayesian Optimization - Classification
####################################################

# Objective functions for the classification searches. Each one fits on
# X_train / y_train and returns accuracy on X_test / y_test, which is the
# value the optimiser maximises.
# They are named *_objective so that running this cell never rebinds
# optimize_dtc / optimize_rfc / optimize_gbc: the "Optimizer Functions" cell
# defines functions with those names and a different signature, and
# optimize_model() calls them.
# Estimators are seeded so evaluating the same point twice gives the same score.


# Decision Tree Classifier
def dtc_objective(max_depth, min_samples_split, min_samples_leaf):
    """Accuracy on the test split for a decision tree with the proposed settings."""
    # The optimiser proposes floats; these settings must be integers.
    dtc = DecisionTreeClassifier(max_depth=int(max_depth), min_samples_split=int(min_samples_split),
                                 min_samples_leaf=int(min_samples_leaf), random_state=42)
    dtc.fit(X_train, y_train)
    y_pred = dtc.predict(X_test)
    return accuracy_score(y_test, y_pred)

dtc_bounds = {'max_depth': (1, 50), 'min_samples_split': (2, 20),
              'min_samples_leaf': (1, 20)}


# Random Forest Classifier
def rfc_objective(n_estimators, max_depth, min_samples_split, min_samples_leaf):
    """Accuracy on the test split for a random forest with the proposed settings."""
    rfc = RandomForestClassifier(n_estimators=int(n_estimators), max_depth=int(max_depth),
                                 min_samples_split=int(min_samples_split), min_samples_leaf=int(min_samples_leaf),
                                 random_state=42)
    rfc.fit(X_train, y_train)
    y_pred = rfc.predict(X_test)
    return accuracy_score(y_test, y_pred)

rfc_bounds = {'n_estimators': (10, 100), 'max_depth': (1, 50), 'min_samples_split': (2, 20),
              'min_samples_leaf': (1, 20)}


# Gradient Boosting Classifier
def gbc_objective(n_estimators, learning_rate, max_depth, min_samples_split, min_samples_leaf, subsample):
    """Accuracy on the test split for gradient boosting with the proposed settings."""
    gbc = GradientBoostingClassifier(n_estimators=int(n_estimators), learning_rate=learning_rate,
                                     max_depth=int(max_depth), min_samples_split=int(min_samples_split),
                                     min_samples_leaf=int(min_samples_leaf), subsample=subsample,
                                     random_state=42)
    gbc.fit(X_train, y_train)
    y_pred = gbc.predict(X_test)
    return accuracy_score(y_test, y_pred)

gbc_bounds = {'n_estimators': (10, 100), 'learning_rate': (0.001, 1.0), 'max_depth': (1, 50),
              'min_samples_split': (2, 20), 'min_samples_leaf': (1, 20), 'subsample': (0.1, 1.0)}


# Only run the classification searches when the detected problem type is
# classification: scikit-learn classifiers reject a continuous target, so
# without this guard the cell stops "Run All" on a regression dataset.
if str(algorithm).lower() in ('classification', 'c'):
    bayes_dtc = BayesianOptimization(f=dtc_objective, pbounds=dtc_bounds, random_state=42)
    bayes_dtc.maximize(init_points=10, n_iter=10)

    print()

    bayes_rfc = BayesianOptimization(f=rfc_objective, pbounds=rfc_bounds, random_state=42)
    bayes_rfc.maximize(init_points=10, n_iter=10)

    print()

    bayes_gbc = BayesianOptimization(f=gbc_objective, pbounds=gbc_bounds, random_state=42)
    bayes_gbc.maximize(init_points=10, n_iter=10)

    print()

    print("Decision Tree Classifier: Best parameters -", bayes_dtc.max)
    print("Accuracy:", bayes_dtc.max['target'])
    print()
    print("Random Forest Classifier: Best parameters -", bayes_rfc.max)
    print("Accuracy:", bayes_rfc.max['target'])
    print()
    print("GBM Classifier: Best parameters -", bayes_gbc.max)
    print("Accuracy:", bayes_gbc.max['target'])
else:
    print(f"Skipping classification HPO: problem type is {algorithm!r}")

In [None]:
#@title Optimizer Functions

# Decision Tree Regressor Optimization
def optimize_dtr(X_train, X_test, y_train, y_test, init_points=10, n_iter=10, random_state=42):
    """Tune a DecisionTreeRegressor with Bayesian optimisation.

    Each candidate is fitted on ``X_train, y_train`` and scored with R^2 on
    ``X_test, y_test``; the optimiser maximises that score.
    NOTE(review): because the search is scored on the test split, the
    returned score is not an independent estimate - confirm this is intended.

    Parameters
    ----------
    X_train, X_test, y_train, y_test : array-like
        Data used to fit and to score every candidate.
    init_points : int, default 10
        Number of random exploration steps passed to ``maximize``.
    n_iter : int, default 10
        Number of guided optimisation steps passed to ``maximize``.
    random_state : int, default 42
        Seed for both the optimiser and the estimator, so that evaluating the
        same hyperparameters twice gives the same score.

    Returns
    -------
    tuple
        ``(optimizer, best_score)``: the ``BayesianOptimization`` object after
        the search and its best target value (``optimizer.max['target']``).
    """
    def dtr_optimizer(max_depth, min_samples_split, min_samples_leaf):
        # The optimiser proposes floats; these settings must be integers.
        dtr = DecisionTreeRegressor(max_depth=int(max_depth), min_samples_split=int(min_samples_split),
                                    min_samples_leaf=int(min_samples_leaf), random_state=random_state)
        dtr.fit(X_train, y_train)
        y_pred = dtr.predict(X_test)
        return r2_score(y_test, y_pred)

    dtr_bounds = {'max_depth': (1, 50), 'min_samples_split': (2, 20),
                  'min_samples_leaf': (1, 20)}

    bayes_dtr = BayesianOptimization(f=dtr_optimizer, pbounds=dtr_bounds, random_state=random_state)
    bayes_dtr.maximize(init_points=init_points, n_iter=n_iter)

    return bayes_dtr, bayes_dtr.max['target']

# Random Forest Regressor Optimization
def optimize_rfr(X_train, X_test, y_train, y_test, init_points=10, n_iter=10, random_state=42):
    """Tune a RandomForestRegressor with Bayesian optimisation.

    Each candidate is fitted on ``X_train, y_train`` and scored with R^2 on
    ``X_test, y_test``; the optimiser maximises that score.
    NOTE(review): because the search is scored on the test split, the
    returned score is not an independent estimate - confirm this is intended.

    Parameters
    ----------
    X_train, X_test, y_train, y_test : array-like
        Data used to fit and to score every candidate.
    init_points : int, default 10
        Number of random exploration steps passed to ``maximize``.
    n_iter : int, default 10
        Number of guided optimisation steps passed to ``maximize``.
    random_state : int, default 42
        Seed for both the optimiser and the forest, so that evaluating the
        same hyperparameters twice gives the same score.

    Returns
    -------
    tuple
        ``(optimizer, best_score)``: the ``BayesianOptimization`` object after
        the search and its best target value (``optimizer.max['target']``).
    """
    def rfr_optimizer(n_estimators, max_depth, min_samples_split, min_samples_leaf):
        # The optimiser proposes floats; these settings must be integers.
        rfr = RandomForestRegressor(n_estimators=int(n_estimators), max_depth=int(max_depth),
                                    min_samples_split=int(min_samples_split), min_samples_leaf=int(min_samples_leaf),
                                    random_state=random_state)
        rfr.fit(X_train, y_train)
        y_pred = rfr.predict(X_test)
        return r2_score(y_test, y_pred)

    rfr_bounds = {'n_estimators': (10, 100), 'max_depth': (1, 50), 'min_samples_split': (2, 20),
                  'min_samples_leaf': (1, 20)}

    bayes_rfr = BayesianOptimization(f=rfr_optimizer, pbounds=rfr_bounds, random_state=random_state)
    bayes_rfr.maximize(init_points=init_points, n_iter=n_iter)

    return bayes_rfr, bayes_rfr.max['target']

# Gradient Boosting Regressor Optimization
def optimize_gbr(X_train, X_test, y_train, y_test, init_points=10, n_iter=10, random_state=42):
    """Tune a GradientBoostingRegressor with Bayesian optimisation.

    Each candidate is fitted on ``X_train, y_train`` and scored with R^2 on
    ``X_test, y_test``; the optimiser maximises that score.
    NOTE(review): because the search is scored on the test split, the
    returned score is not an independent estimate - confirm this is intended.

    Parameters
    ----------
    X_train, X_test, y_train, y_test : array-like
        Data used to fit and to score every candidate.
    init_points : int, default 10
        Number of random exploration steps passed to ``maximize``.
    n_iter : int, default 10
        Number of guided optimisation steps passed to ``maximize``.
    random_state : int, default 42
        Seed for both the optimiser and the estimator, so that evaluating the
        same hyperparameters twice gives the same score.

    Returns
    -------
    tuple
        ``(optimizer, best_score)``: the ``BayesianOptimization`` object after
        the search and its best target value (``optimizer.max['target']``).
    """
    def gbr_optimizer(n_estimators, learning_rate, max_depth, min_samples_split, min_samples_leaf, subsample):
        # Count-like settings arrive as floats and are truncated to integers;
        # learning_rate and subsample are used as proposed.
        gbr = GradientBoostingRegressor(n_estimators=int(n_estimators), learning_rate=learning_rate,
                                        max_depth=int(max_depth), min_samples_split=int(min_samples_split),
                                        min_samples_leaf=int(min_samples_leaf), subsample=subsample,
                                        random_state=random_state)
        gbr.fit(X_train, y_train)
        y_pred = gbr.predict(X_test)
        return r2_score(y_test, y_pred)

    gbr_bounds = {'n_estimators': (10, 100), 'learning_rate': (0.001, 1.0), 'max_depth': (1, 50),
                  'min_samples_split': (2, 20), 'min_samples_leaf': (1, 20), 'subsample': (0.1, 1.0)}

    bayes_gbr = BayesianOptimization(f=gbr_optimizer, pbounds=gbr_bounds, random_state=random_state)
    bayes_gbr.maximize(init_points=init_points, n_iter=n_iter)

    return bayes_gbr, bayes_gbr.max['target']


# Decision Tree Classifier Optimization
def optimize_dtc(X_train, X_test, y_train, y_test, init_points=10, n_iter=10, random_state=42):
    """Tune a DecisionTreeClassifier with Bayesian optimisation.

    Each candidate is fitted on ``X_train, y_train`` and scored with accuracy
    on ``X_test, y_test``; the optimiser maximises that score.
    NOTE(review): because the search is scored on the test split, the
    returned score is not an independent estimate - confirm this is intended.

    Parameters
    ----------
    X_train, X_test, y_train, y_test : array-like
        Data used to fit and to score every candidate.
    init_points : int, default 10
        Number of random exploration steps passed to ``maximize``.
    n_iter : int, default 10
        Number of guided optimisation steps passed to ``maximize``.
    random_state : int, default 42
        Seed for both the optimiser and the estimator, so that evaluating the
        same hyperparameters twice gives the same score.

    Returns
    -------
    tuple
        ``(optimizer, best_score)``: the ``BayesianOptimization`` object after
        the search and its best target value (``optimizer.max['target']``).
    """
    def dtc_optimizer(max_depth, min_samples_split, min_samples_leaf):
        # The optimiser proposes floats; these settings must be integers.
        dtc = DecisionTreeClassifier(max_depth=int(max_depth), min_samples_split=int(min_samples_split),
                                     min_samples_leaf=int(min_samples_leaf), random_state=random_state)
        dtc.fit(X_train, y_train)
        y_pred = dtc.predict(X_test)
        return accuracy_score(y_test, y_pred)

    dtc_bounds = {'max_depth': (1, 50), 'min_samples_split': (2, 20),
                  'min_samples_leaf': (1, 20)}

    bayes_dtc = BayesianOptimization(f=dtc_optimizer, pbounds=dtc_bounds, random_state=random_state)
    bayes_dtc.maximize(init_points=init_points, n_iter=n_iter)

    return bayes_dtc, bayes_dtc.max['target']

# Random Forest Classifier Optimization
def optimize_rfc(X_train, X_test, y_train, y_test, init_points=10, n_iter=10, random_state=42):
    """Tune a RandomForestClassifier with Bayesian optimisation.

    Each candidate is fitted on ``X_train, y_train`` and scored with accuracy
    on ``X_test, y_test``; the optimiser maximises that score.
    NOTE(review): because the search is scored on the test split, the
    returned score is not an independent estimate - confirm this is intended.

    Parameters
    ----------
    X_train, X_test, y_train, y_test : array-like
        Data used to fit and to score every candidate.
    init_points : int, default 10
        Number of random exploration steps passed to ``maximize``.
    n_iter : int, default 10
        Number of guided optimisation steps passed to ``maximize``.
    random_state : int, default 42
        Seed for both the optimiser and the forest, so that evaluating the
        same hyperparameters twice gives the same score.

    Returns
    -------
    tuple
        ``(optimizer, best_score)``: the ``BayesianOptimization`` object after
        the search and its best target value (``optimizer.max['target']``).
    """
    def rfc_optimizer(n_estimators, max_depth, min_samples_split, min_samples_leaf):
        # The optimiser proposes floats; these settings must be integers.
        rfc = RandomForestClassifier(n_estimators=int(n_estimators), max_depth=int(max_depth),
                                     min_samples_split=int(min_samples_split), min_samples_leaf=int(min_samples_leaf),
                                     random_state=random_state)
        rfc.fit(X_train, y_train)
        y_pred = rfc.predict(X_test)
        return accuracy_score(y_test, y_pred)

    rfc_bounds = {'n_estimators': (10, 100), 'max_depth': (1, 50), 'min_samples_split': (2, 20),
                  'min_samples_leaf': (1, 20)}

    bayes_rfc = BayesianOptimization(f=rfc_optimizer, pbounds=rfc_bounds, random_state=random_state)
    bayes_rfc.maximize(init_points=init_points, n_iter=n_iter)

    return bayes_rfc, bayes_rfc.max['target']

# Gradient Boosting Classifier Optimization
def optimize_gbc(X_train, X_test, y_train, y_test, init_points=10, n_iter=10, random_state=42):
    """Tune a GradientBoostingClassifier with Bayesian optimisation.

    Each candidate is fitted on ``X_train, y_train`` and scored with accuracy
    on ``X_test, y_test``; the optimiser maximises that score.
    NOTE(review): because the search is scored on the test split, the
    returned score is not an independent estimate - confirm this is intended.

    Parameters
    ----------
    X_train, X_test, y_train, y_test : array-like
        Data used to fit and to score every candidate.
    init_points : int, default 10
        Number of random exploration steps passed to ``maximize``.
    n_iter : int, default 10
        Number of guided optimisation steps passed to ``maximize``.
    random_state : int, default 42
        Seed for both the optimiser and the estimator, so that evaluating the
        same hyperparameters twice gives the same score.

    Returns
    -------
    tuple
        ``(optimizer, best_score)``: the ``BayesianOptimization`` object after
        the search and its best target value (``optimizer.max['target']``).
    """
    def gbc_optimizer(n_estimators, learning_rate, max_depth, min_samples_split, min_samples_leaf, subsample):
        # Count-like settings arrive as floats and are truncated to integers;
        # learning_rate and subsample are used as proposed.
        gbc = GradientBoostingClassifier(n_estimators=int(n_estimators), learning_rate=learning_rate,
                                         max_depth=int(max_depth), min_samples_split=int(min_samples_split),
                                         min_samples_leaf=int(min_samples_leaf), subsample=subsample,
                                         random_state=random_state)
        gbc.fit(X_train, y_train)
        y_pred = gbc.predict(X_test)
        return accuracy_score(y_test, y_pred)

    gbc_bounds = {'n_estimators': (10, 100), 'learning_rate': (0.001, 1.0), 'max_depth': (1, 50),
                  'min_samples_split': (2, 20), 'min_samples_leaf': (1, 20), 'subsample': (0.1, 1.0)}

    bayes_gbc = BayesianOptimization(f=gbc_optimizer, pbounds=gbc_bounds, random_state=random_state)
    bayes_gbc.maximize(init_points=init_points, n_iter=n_iter)

    return bayes_gbc, bayes_gbc.max['target']

In [None]:
#@title Hyperparameter tuning
def optimize_model(algorithm, model, X_train, X_test, y_train, y_test):
    """Run the hyperparameter search that matches the selected estimator.

    Parameters
    ----------
    algorithm : str
        Problem type, matched case-insensitively: regression / r or
        classification / c.
    model : estimator
        The selected estimator; its class decides which ``optimize_*``
        function is called.
    X_train, X_test, y_train, y_test : array-like
        Forwarded unchanged to the ``optimize_*`` function.

    Returns
    -------
    tuple
        Whatever the matching ``optimize_*`` function returns, i.e. an
        ``(optimizer, best_score)`` pair. When the problem type is not
        recognised, or the estimator is not one of the supported classes for
        that problem type, ``(None, None)`` is returned so callers can always
        unpack two values.
    """
    print(algorithm)
    print(model)

    kind = str(algorithm).strip().lower()

    # (estimator class, tuning function) pairs, built only for the family
    # that was requested.
    if kind in ('regression', 'r'):
        dispatch = [
            (DecisionTreeRegressor, optimize_dtr),
            (RandomForestRegressor, optimize_rfr),
            (GradientBoostingRegressor, optimize_gbr),
        ]
    elif kind in ('classification', 'c'):
        dispatch = [
            (DecisionTreeClassifier, optimize_dtc),
            (RandomForestClassifier, optimize_rfc),
            (GradientBoostingClassifier, optimize_gbc),
        ]
    else:
        dispatch = []

    for estimator_cls, tuner in dispatch:
        if isinstance(model, estimator_cls):
            return tuner(X_train, X_test, y_train, y_test)

    # Reached for an unknown problem type and also for an estimator outside
    # the supported classes (which used to fail with UnboundLocalError).
    print('No model selected !!')
    return None, None

# Tune the selected estimator. On its tuning branches optimize_model returns
# what the optimize_* function returned: a (BayesianOptimization object,
# best target value) pair, so final_model holds the optimizer object rather
# than a fitted estimator.
final_model, performace = optimize_model(algorithm, best_model, X_train, X_test, y_train, y_test)
print(f"Model : {best_model}")
print(f"Performance : {performace}")
print(final_model)

regression
RandomForestRegressor()
No model selected !!
Model : RandomForestRegressor()
Performance : None
