In [None]:
# !pip install bayesian-optimization
# !pip install shap

In [None]:
#@title Importing Libraries

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy.stats import skew
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from bayes_opt import BayesianOptimization
from sklearn.metrics import r2_score, accuracy_score

In [None]:
#@title Preprocessing Database

# Column names treated as identifiers / metadata rather than features.
# `remove_irrelevant_columns` drops any column whose name matches one of these
# entries exactly (case-sensitive), which is why both the spaced/underscored and
# the capitalised/lower-case spellings are listed.
# Every entry must be followed by a comma: adjacent string literals are silently
# concatenated by Python (a missing comma previously fused "year" and "Row ID"
# into the single, never-matching name "yearRow ID").
irrelevant_columns = [
    "ID", "id", "Index", "index", "Serial Number", "serial_number",
    "Address", "address", "Href", "href", "Timestamp", "timestamp",
    "Creation Date", "creation_date", "Last Updated Date", "last_updated_date",
    "Version", "version", "Checksum", "checksum", "Year", "year",
    "Row ID", "row_id", "Record ID", "record_id", "Customer ID", "customer_id",
    "Client ID", "client_id", "Account ID", "account_id", "Transaction ID", "transaction_id",
    "Email", "email", "Phone Number", "phone_number", "Website", "website",
    "Fax", "fax", "IP Address", "ip_address", "MAC Address", "mac_address",
    "Social Security Number", "social_security_number", "Driver's License", "drivers_license",
    "ID_", "id_", "Index_", "index_", "Serial_Number", "serial_number_",
    "Address_", "address_", "Href_", "href_", "Timestamp_", "timestamp_",
    "Creation_Date", "creation_date_", "Last_Updated_Date", "last_updated_date_",
    "Version_", "version_", "Checksum_", "checksum_",
    "Row_ID", "row_id_", "Record_ID", "record_id_", "Customer_ID", "customer_id_",
    "Client_ID", "client_id_", "Account_ID", "account_id_", "Transaction_ID", "transaction_id_",
    "Email_", "email_", "Phone_Number", "phone_number_", "Website_", "website_",
    "Fax_", "fax_", "IP_Address", "ip_address_", "MAC_Address", "mac_address_",
    "Social_Security_Number", "social_security_number_", "Driver's_License", "drivers_license_"
]

# Lower-cased ordinal label -> integer rank, used by `encode_objects` to turn
# ordered categorical text into numbers. Several scales intentionally skip the
# midpoint (e.g. 'unlikely' = 2, 'likely' = 4) so that they line up with the
# five-point scales that do define a neutral value of 3.
# Each label appears exactly once; labels shared by several scales
# ('moderate', 'neutral', the agree/disagree family) carry the same rank in all
# of them, so a single entry covers every scale.
ordinal_data = {
    # Generic levels
    "low": 1,
    "medium": 2,
    "moderate": 2,  # also the middle step of the illness-severity scale
    "high": 3,
    # Education level
    "high school diploma": 1,
    "associate's degree": 2,
    "bachelor's degree": 3,
    "master's degree": 4,
    "doctorate degree": 5,
    # Income level
    "low income": 1,
    "middle income": 2,
    "high income": 3,
    # Customer satisfaction
    "very dissatisfied": 1,
    "dissatisfied": 2,
    "neutral": 3,  # shared with the degree-of-agreement scale
    "satisfied": 4,
    "very satisfied": 5,
    # Likert scale / degree of agreement
    "strongly disagree": 1,
    "disagree": 2,
    "neither agree nor disagree": 3,
    "agree": 4,
    "strongly agree": 5,
    # Job seniority
    "entry-level": 1,
    "mid-level": 2,
    "senior-level": 3,
    "executive-level": 4,
    # Severity of illness / condition ('moderate' is defined above)
    "mild": 1,
    "severe": 3,
    # Temperature
    "cold": 1,
    "warm": 2,
    "hot": 3,
    "very hot": 4,
    # Customer rating
    "1 star": 1,
    "2 stars": 2,
    "3 stars": 3,
    "4 stars": 4,
    "5 stars": 5,
    # Likelihood of purchase
    "very unlikely": 1,
    "unlikely": 2,
    "likely": 4,
    "very likely": 5,
    # Pain scale
    "no pain": 1,
    "mild pain": 2,
    "moderate pain": 3,
    "severe pain": 4,
    "extreme pain": 5,
    # Likelihood of recommendation
    "very unlikely to recommend": 1,
    "unlikely to recommend": 2,
    "likely to recommend": 4,
    "very likely to recommend": 5,
    # Quality ratings
    "poor quality": 1,
    "fair quality": 2,
    "good quality": 3,
    "very good quality": 4,
    "excellent quality": 5,
    # Customer service experience
    "very poor": 1,
    "poor": 2,
    "average": 3,
    "good": 4,
    "excellent": 5,
    # Ease of use
    "very difficult": 1,
    "difficult": 2,
    "easy": 4,
    "very easy": 5,
    # Likelihood of churn
    "very unlikely to churn": 1,
    "unlikely to churn": 2,
    "likely to churn": 4,
    "very likely to churn": 5,
    # Satisfaction with product / service
    "not satisfied": 1,
    "slightly satisfied": 2,
    "moderately satisfied": 3,
    "extremely satisfied": 5,
    # Risk levels
    "low risk": 1,
    "moderate risk": 2,
    "high risk": 3,
    # Performance ratings
    "below expectations": 1,
    "meeting expectations": 2,
    "exceeding expectations": 3,
}

In [None]:
#@title Data Preprocessing

def remove_irrelevant_columns(data, irrelevant_columns):
  """Drop every column of `data` whose name appears in `irrelevant_columns`.

  Matching is an exact name comparison. The frame is modified in place and
  the same object is returned.
  """
  unwanted = [name for name in data.columns if name in irrelevant_columns]

  # Nothing matched: hand the frame back untouched.
  if not unwanted:
    return data

  data.drop(columns=unwanted, inplace=True)
  return data


def remove_duplicate_values(data):
  """Remove fully duplicated rows from `data` in place and return it.

  The first occurrence of each row is kept and the surviving rows keep their
  original index labels (the index is not reset).
  """
  if not data.duplicated().any():
    return data

  data.drop_duplicates(inplace=True)
  return data


def remove_constant_values(data):
  """Drop columns that hold exactly one distinct non-null value.

  Such columns carry no information for a model. The frame is modified in
  place and returned. Columns that are entirely null have zero distinct
  values and are therefore kept.
  """
  single_valued = []
  for name in data.columns:
    if data[name].nunique() == 1:
      single_valued.append(name)

  if single_valued:
    data.drop(columns=single_valued, inplace=True)

  return data


def remove_string_numerical(data):
  """Convert object columns made up solely of digit strings to numeric dtype.

  A column qualifies when its dtype is `object` and `str.isnumeric()` holds
  for all of its values; qualifying columns are replaced by the result of
  `pd.to_numeric`. The frame is modified in place and returned.
  """
  def _holds_digit_text(name):
    series = data[name]
    # The dtype test short-circuits so `.str` is only used on object columns.
    return series.dtype == 'object' and series.str.isnumeric().all()

  digit_columns = [name for name in data.columns if _holds_digit_text(name)]

  if digit_columns:
    data[digit_columns] = data[digit_columns].apply(pd.to_numeric)

  return data


def remove_object_numerical(data):
  """Extract the first number embedded in each text value into a new column.

  For every object column `<col>`, the first unsigned integer or decimal
  found in each value (e.g. '7.5 kg' -> 7.5) is stored as a float in a new
  column `<col>_numeric`; values without any digits become NaN. The original
  text column is kept.

  A `<col>_numeric` column is only created when at least one value in the
  column actually contains a number. Previously a column of nothing but
  `None` was added for every purely textual column, which added no
  information and introduced all-missing columns into the later imputation
  step.

  The frame is modified in place and returned.
  """
  import re  # kept local: the notebook's import cell does not import `re`

  number_pattern = re.compile(r'\d+\.\d+|\d+')

  def first_number(value):
    # Leftmost match, identical to taking element 0 of `re.findall`.
    match = number_pattern.search(str(value))
    return float(match.group()) if match else float('nan')

  # The column list is captured before the loop, so the `_numeric` columns
  # added below are never scanned themselves.
  for col in data.select_dtypes(include=['object']).columns:
    extracted = data[col].map(first_number).astype(float)
    if extracted.notna().any():
      data[col + '_numeric'] = extracted

  return data


def missing_values(data, threshold=0.3, k_neighbors=5):
  """Impute missing values in `data` in place and return it.

  Numeric columns are imputed with a `KNNImputer` (using `k_neighbors`
  neighbours) when at least one column has a missing fraction below
  `threshold`, otherwise with a mean `SimpleImputer`. Object columns are
  imputed with their most frequent value. If the frame has no missing value
  at all it is returned untouched.

  Each column group is only imputed when it is non-empty: fitting an imputer
  on a frame with zero columns raises, which previously made the function
  fail on all-numeric (or all-text) datasets that contained any NaN.

  NOTE(review): the KNN/mean choice is kept exactly as originally written.
  Because columns without any missing value have a fraction of 0, the KNN
  branch is taken for almost every dataset - confirm that is intended.
  """
  numerical_cols = data.select_dtypes(include=['number']).columns
  categorical_cols = data.select_dtypes(include=['object']).columns

  if not data.isnull().any().any():
    return data

  missing_percentage = data.isnull().mean()

  if len(numerical_cols) > 0:
    if (missing_percentage < threshold).any():
      imputer = KNNImputer(n_neighbors=k_neighbors)
    else:
      imputer = SimpleImputer(strategy='mean')
    data[numerical_cols] = imputer.fit_transform(data[numerical_cols])

  if len(categorical_cols) > 0:
    mode_imputer = SimpleImputer(strategy='most_frequent')
    data[categorical_cols] = mode_imputer.fit_transform(data[categorical_cols])

  return data


# def convert_datetime(data):
#     object_cols = data.select_dtypes(include=['object']).columns

#     if len(object_cols) > 0:
#         for col in object_cols:
#             try:
#                 data[col] = pd.to_datetime(data[col])
#                 data[col + '_numeric'] = data[col].astype('int64') // 10**9
#                 data.drop(columns=[col], inplace=True)
#             except (ValueError, TypeError):
#                 pass

#     return data


def encode_objects(data):
  """Encode object (text) columns of `data` as numbers and return the frame.

  Per object column, based on its number of distinct values:
    * exactly 2  -> integer codes from `LabelEncoder`;
    * up to 7    -> if every lower-cased value is a key of `ordinal_data`,
                    the column is replaced by the mapped ranks; otherwise it
                    is one-hot encoded (first category dropped) into
                    `<col>_<category>` columns and the original is removed;
    * more than 7 -> left unchanged (the original carried an unimplemented
                    `#else : LabelEncoder()` TODO for this case).

  Changes relative to the previous version:
    * The one-hot frame is built on `data.index`. It used to get a fresh
      0..n-1 index, so after duplicate rows had been dropped `pd.concat`
      aligned it against the wrong rows and produced NaN-filled rows.
    * The `any(word in data[col].str.lower() ...)` branch was removed. `in`
      on a Series tests the index labels, not the values, so the branch was
      never taken; had it run, it would have left a mix of ints and strings
      in the column. Columns that are only partly covered by `ordinal_data`
      are one-hot encoded, exactly as they effectively were before.

  Label/ordinal columns are written into the frame that was passed in;
  one-hot encoding rebinds to a new frame, so always use the return value.
  """
  categorical_columns = data.select_dtypes(include=['object']).columns

  for col in categorical_columns:
    unique_values_count = data[col].nunique()

    if unique_values_count == 2:
      data[col] = LabelEncoder().fit_transform(data[col])

    elif unique_values_count <= 7:
      lowered = data[col].str.lower()

      if all(value in ordinal_data for value in lowered):
        data[col] = lowered.map(ordinal_data)
      else:
        encoder = OneHotEncoder(sparse_output=False, drop='first')
        encoded_values = encoder.fit_transform(data[[col]])
        # drop='first' removes the first category, hence the [1:] slice.
        col_names = [f"{col}_{value}" for value in encoder.categories_[0][1:]]
        dummies = pd.DataFrame(encoded_values, columns=col_names, index=data.index)
        data = pd.concat([data.drop(columns=[col]), dummies], axis=1)

  return data


def adjust_values(data):
  """Standardise the numeric columns of `data` in place and return it.

  Every numeric column is replaced by its `StandardScaler` output. The
  previous version computed the scaled array but discarded it, so the data
  came back unscaled; it also passed non-numeric columns to the scaler.
  Non-numeric columns are now left untouched, and a frame with no numeric
  columns or no rows is returned as is.

  NOTE(review): the original body carried a bare `#scipy` marker, presumably
  a reminder for a scipy-based adjustment (`skew` is imported in the
  notebook) - not implemented here; confirm intent.
  """
  numeric_cols = data.select_dtypes(include=['number']).columns

  # The scaler cannot be fitted on zero features or zero samples.
  if len(numeric_cols) == 0 or len(data) == 0:
    return data

  scaler = StandardScaler()
  data[numeric_cols] = scaler.fit_transform(data[numeric_cols])
  return data


#master function
def preprocess_data(data, threshold=0.5, k_neighbors=5):
    """Run the full preprocessing pipeline and return the cleaned frame.

    Steps, in order: drop identifier/metadata columns, drop duplicate rows,
    drop constant columns, convert digit-only text columns to numbers,
    extract embedded numbers from text columns, impute missing values and
    encode the remaining text columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw input. It is copied first, so the caller's frame is left intact
        (the helper steps modify their argument in place) and this function
        can be re-run on the same raw data.
    threshold, k_neighbors
        Forwarded to `missing_values`.

    Returns
    -------
    pandas.DataFrame
        The preprocessed copy.
    """
    data = data.copy()

    data = remove_irrelevant_columns(data, irrelevant_columns)
    data = remove_duplicate_values(data)
    data = remove_constant_values(data)
    data = remove_string_numerical(data)
    data = remove_object_numerical(data)
    data = missing_values(data, threshold, k_neighbors)
    # data = convert_datetime(data)
    data = encode_objects(data)
    # data = adjust_values(data)

    return data

In [None]:
#feature selection

In [None]:
#@title Model Selection

def algorithm_type(x_var, y_var):
    """Decide whether the target calls for 'Regression' or 'Classification'.

    Decision order:
      1. object dtype or a binary target      -> 'Classification'
      2. continuous target                    -> 'Regression'
      3. int64/float64 dtype or multiclass    -> fit a random-forest
         classifier and regressor on a 77/23 split and pick whichever
         `.score()` is higher (accuracy vs. R^2, a rough heuristic)
      4. anything else                        -> ask the user

    Parameters
    ----------
    x_var : feature matrix
    y_var : target vector (must expose `.dtype`)

    Returns
    -------
    str
        'Regression' or 'Classification'.

    Raises
    ------
    ValueError
        If the manual answer is not r / regression / c / classification.

    Fixes relative to the previous version: the probe models are trained on
    the arguments `x_var` / `y_var` rather than on the notebook globals
    `X` / `y`; the probe forests are seeded so the decision is reproducible;
    a manual 'r' / 'c' answer is normalised to the same labels the automatic
    branches return.
    """
    from sklearn.utils.multiclass import type_of_target

    dtype = y_var.dtype                      # int / float / object
    target_type = type_of_target(y_var)      # binary / continuous / multiclass / ...

    if dtype == 'object' or target_type == 'binary':
        print('Object or Binary target variable detected !')
        return 'Classification'

    if target_type == 'continuous':
        print('Continuous target variable detected !')
        return 'Regression'

    if dtype in ['int64', 'float64'] or target_type in ['multiclass']:
        X_train, X_test, y_train, y_test = train_test_split(
            x_var, y_var, test_size=0.23, random_state=42)
        rf_classifier = RandomForestClassifier(random_state=42)  # scored by accuracy
        rf_regressor = RandomForestRegressor(random_state=42)    # scored by R^2
        rf_classifier.fit(X_train, y_train)
        rf_regressor.fit(X_train, y_train)
        classifier_score = rf_classifier.score(X_test, y_test)
        regressor_score = rf_regressor.score(X_test, y_test)
        print(f'CS:{classifier_score} , RS:{regressor_score}')
        if classifier_score > regressor_score:
            return 'Classification'
        return 'Regression'

    answer = input('''Specify problem type manually -
        (r for Regression / c for Classification) : ''').strip().lower()
    if answer in ('r', 'regression'):
        return 'Regression'
    if answer in ('c', 'classification'):
        return 'Classification'
    raise ValueError('Invalid problem type specified.')


def model_analysis(ptype):
    """Return the candidate estimators for the given problem type.

    `ptype` is matched case-insensitively: 'regression' / 'r' yields the
    three regressors, 'classification' / 'c' the three classifiers. The
    result is a list of `(name, unfitted estimator)` tuples.

    Raises ValueError for any other value.
    """
    kind = ptype.lower()

    if kind in ('regression', 'r'):
        return [
            ('DecisionTreeRegressor', DecisionTreeRegressor()),
            ('RandomForestRegressor', RandomForestRegressor()),
            ('GradientBoostingRegressor', GradientBoostingRegressor()),
        ]

    if kind in ('classification', 'c'):
        return [
            ('DecisionTreeClassifier', DecisionTreeClassifier()),
            ('RandomForestClassifier', RandomForestClassifier()),
            ('GradientBoostingClassifier', GradientBoostingClassifier()),
        ]

    raise ValueError('Invalid problem type specified.')


def model_selection(models, X, y, problem_type, cv=3):   #sampling original data
    """Pick the candidate with the best mean cross-validated score.

    Parameters
    ----------
    models : iterable of (name, estimator) tuples, as built by `model_analysis`
    X, y : features and target handed to `cross_val_score`
    problem_type : str
        'regression' / 'r' (scored with R^2) or 'classification' / 'c'
        (scored with accuracy), case-insensitive.
    cv : int
        Number of cross-validation folds.

    Returns
    -------
    The unfitted estimator with the highest mean score, or None when
    `models` is empty or no score is comparable (e.g. every score is NaN).

    Raises
    ------
    ValueError
        For an unknown `problem_type`.

    The previous version compared `problem_type` with the lower-case
    literals only, while `algorithm_type` returns 'Regression' /
    'Classification'. Regression targets were therefore scored with
    'accuracy' and no model was ever selected.
    """
    kind = str(problem_type).strip().lower()
    if kind in ('regression', 'r'):
        scoring = 'r2'
    elif kind in ('classification', 'c'):
        scoring = 'accuracy'
    else:
        raise ValueError('Invalid problem type specified.')

    best_model = None
    # R^2 can be negative, and an accuracy of exactly 0 must still be able
    # to win, so both problem types start from -inf.
    best_score = float('-inf')

    for name, model in models:
        mean_score = cross_val_score(model, X, y, scoring=scoring, cv=cv).mean()
        if mean_score > best_score:
            best_score = mean_score
            best_model = model

    return best_model

In [None]:
#@title Optimizers

# # Decision Tree Regressor Optimization
# def optimize_dtr(X_train, X_test, y_train, y_test):
#     def optimize_dtr_inner(max_depth, min_samples_split, min_samples_leaf):
#         dtr = DecisionTreeRegressor(max_depth=int(max_depth), min_samples_split=int(min_samples_split),
#                                      min_samples_leaf=int(min_samples_leaf))
#         dtr.fit(X_train, y_train)
#         y_pred = dtr.predict(X_test)
#         return r2_score(y_test, y_pred)

#     dtr_bounds = {'max_depth': (1, 50), 'min_samples_split': (2, 20),
#                   'min_samples_leaf': (1, 20)}

#     bayes_dtr = BayesianOptimization(f=optimize_dtr_inner, pbounds=dtr_bounds, random_state=42)
#     bayes_dtr.maximize(init_points=10, n_iter=10)

#     return bayes_dtr, bayes_dtr.max['target']

# # Random Forest Regressor Optimization
# def optimize_rfr(X_train, X_test, y_train, y_test):
#     def optimize_rfr_inner(n_estimators, max_depth, min_samples_split, min_samples_leaf):
#         rfr = RandomForestRegressor(n_estimators=int(n_estimators), max_depth=int(max_depth),
#                                      min_samples_split=int(min_samples_split), min_samples_leaf=int(min_samples_leaf))
#         rfr.fit(X_train, y_train)
#         y_pred = rfr.predict(X_test)
#         return r2_score(y_test, y_pred)

#     rfr_bounds = {'n_estimators': (10, 100), 'max_depth': (1, 50), 'min_samples_split': (2, 20),
#                   'min_samples_leaf': (1, 20)}

#     bayes_rfr = BayesianOptimization(f=optimize_rfr_inner, pbounds=rfr_bounds, random_state=42)
#     bayes_rfr.maximize(init_points=10, n_iter=10)

#     return bayes_rfr, bayes_rfr.max['target']

# # Gradient Boosting Regressor Optimization
# def optimize_gbr(X_train, X_test, y_train, y_test):
#     def optimize_gbr_inner(n_estimators, learning_rate, max_depth, min_samples_split, min_samples_leaf, subsample):
#         gbr = GradientBoostingRegressor(n_estimators=int(n_estimators), learning_rate=learning_rate,
#                                          max_depth=int(max_depth), min_samples_split=int(min_samples_split),
#                                          min_samples_leaf=int(min_samples_leaf), subsample=subsample)
#         gbr.fit(X_train, y_train)
#         y_pred = gbr.predict(X_test)
#         return r2_score(y_test, y_pred)

#     gbr_bounds = {'n_estimators': (10, 100), 'learning_rate': (0.001, 1.0), 'max_depth': (1, 50),
#                   'min_samples_split': (2, 20), 'min_samples_leaf': (1, 20), 'subsample': (0.1, 1.0)}

#     bayes_gbr = BayesianOptimization(f=optimize_gbr_inner, pbounds=gbr_bounds, random_state=42)
#     bayes_gbr.maximize(init_points=10, n_iter=10)

#     return bayes_gbr, bayes_gbr.max['target']


# # Decision Tree Classifier Optimization
# def optimize_dtc(X_train, X_test, y_train, y_test):
#     def optimize_dtc_inner(max_depth, min_samples_split, min_samples_leaf):
#         dtc = DecisionTreeClassifier(max_depth=int(max_depth), min_samples_split=int(min_samples_split),
#                                       min_samples_leaf=int(min_samples_leaf))
#         dtc.fit(X_train, y_train)
#         y_pred = dtc.predict(X_test)
#         return accuracy_score(y_test, y_pred)

#     dtc_bounds = {'max_depth': (1, 50), 'min_samples_split': (2, 20),
#                   'min_samples_leaf': (1, 20)}

#     bayes_dtc = BayesianOptimization(f=optimize_dtc_inner, pbounds=dtc_bounds, random_state=42)
#     bayes_dtc.maximize(init_points=10, n_iter=10)

#     return bayes_dtc, bayes_dtc.max['target']

# # Random Forest Classifier Optimization
# def optimize_rfc(X_train, X_test, y_train, y_test):
#     def optimize_rfc_inner(n_estimators, max_depth, min_samples_split, min_samples_leaf):
#         rfc = RandomForestClassifier(n_estimators=int(n_estimators), max_depth=int(max_depth),
#                                       min_samples_split=int(min_samples_split), min_samples_leaf=int(min_samples_leaf))
#         rfc.fit(X_train, y_train)
#         y_pred = rfc.predict(X_test)
#         return accuracy_score(y_test, y_pred)

#     rfc_bounds = {'n_estimators': (10, 100), 'max_depth': (1, 50), 'min_samples_split': (2, 20),
#                   'min_samples_leaf': (1, 20)}

#     bayes_rfc = BayesianOptimization(f=optimize_rfc_inner, pbounds=rfc_bounds, random_state=42)
#     bayes_rfc.maximize(init_points=10, n_iter=10)

#     return bayes_rfc, bayes_rfc.max['target']

# # Gradient Boosting Classifier Optimization
# def optimize_gbc(X_train, X_test, y_train, y_test):
#     def optimize_gbc_inner(n_estimators, learning_rate, max_depth, min_samples_split, min_samples_leaf, subsample):
#         gbc = GradientBoostingClassifier(n_estimators=int(n_estimators), learning_rate=learning_rate,
#                                          max_depth=int(max_depth), min_samples_split=int(min_samples_split),
#                                          min_samples_leaf=int(min_samples_leaf), subsample=subsample)
#         gbc.fit(X_train, y_train)
#         y_pred = gbc.predict(X_test)
#         return accuracy_score(y_test, y_pred)

#     gbc_bounds = {'n_estimators': (10, 100), 'learning_rate': (0.001, 1.0), 'max_depth': (1, 50),
#                   'min_samples_split': (2, 20), 'min_samples_leaf': (1, 20), 'subsample': (0.1, 1.0)}

#     bayes_gbc = BayesianOptimization(f=optimize_gbc_inner, pbounds=gbc_bounds, random_state=42)
#     bayes_gbc.maximize(init_points=10, n_iter=10)

#     return bayes_gbc, bayes_gbc.max['target']

In [None]:
#@title Optimizer Functions

# Decision Tree Regressor Optimization
def optimize_dtr(X_train, X_test, y_train, y_test):
    """Tune a DecisionTreeRegressor with Bayesian optimisation.

    The search maximises R^2 on (X_test, y_test) over `max_depth`,
    `min_samples_split` and `min_samples_leaf`, then refits a regressor with
    the best parameters on the training data.

    Returns
    -------
    (fitted DecisionTreeRegressor, R^2 of that model on the test data)

    Fixes relative to the previous version:
      * the objective returns the bare R^2 score; it used to return a
        `(score, params)` tuple, but the optimiser needs one number per
        evaluation;
      * the final model is a DecisionTreeRegressor (it was a
        DecisionTreeClassifier, which cannot be fitted to a continuous
        target);
      * search and refit share one builder, so `max_depth` is cast the same
        way in both places (`int(round(...))`, as in the other optimisers).

    NOTE(review): hyperparameters are selected on the same test split that
    produces the reported score, so that score is optimistic.
    """
    def build_tree(max_depth, min_samples_split, min_samples_leaf):
        # The optimiser proposes floats; the tree needs integers.
        return DecisionTreeRegressor(max_depth=int(round(max_depth)),
                                     min_samples_split=int(min_samples_split),
                                     min_samples_leaf=int(min_samples_leaf))

    def dtr_optimizer(max_depth, min_samples_split, min_samples_leaf):
        dtr = build_tree(max_depth, min_samples_split, min_samples_leaf)
        dtr.fit(X_train, y_train)
        return r2_score(y_test, dtr.predict(X_test))

    dtr_bounds = {'max_depth': (1, 50), 'min_samples_split': (2, 20),
                  'min_samples_leaf': (1, 20)}

    bayes_dtr = BayesianOptimization(f=dtr_optimizer, pbounds=dtr_bounds, random_state=42)
    bayes_dtr.maximize(init_points=10, n_iter=10)
    best_params = bayes_dtr.max['params']

    dtr_final = build_tree(max_depth=best_params['max_depth'],
                           min_samples_split=best_params['min_samples_split'],
                           min_samples_leaf=best_params['min_samples_leaf'])
    dtr_final.fit(X_train, y_train)

    performance = r2_score(y_test, dtr_final.predict(X_test))

    return dtr_final, performance


# Random Forest Regressor Optimization
def optimize_rfr(X_train, X_test, y_train, y_test):
    """Tune a RandomForestRegressor with Bayesian optimisation.

    Maximises R^2 on (X_test, y_test) over `n_estimators`, `max_depth`,
    `min_samples_split` and `min_samples_leaf`, refits with the best
    parameters on the training data and returns
    `(fitted model, R^2 on the test data)`.
    """
    search_space = {'n_estimators': (10, 100), 'max_depth': (1, 50), 'min_samples_split': (2, 20),
                    'min_samples_leaf': (1, 20)}

    def make_forest(hp):
        # The optimiser proposes floats; the forest needs integers.
        return RandomForestRegressor(n_estimators=int(round(hp['n_estimators'])),
                                     max_depth=int(round(hp['max_depth'])),
                                     min_samples_split=int(hp['min_samples_split']),
                                     min_samples_leaf=int(hp['min_samples_leaf']))

    def test_r2(estimator):
        return r2_score(y_test, estimator.predict(X_test))

    def rfr_optimizer(n_estimators, max_depth, min_samples_split, min_samples_leaf):
        candidate = make_forest({'n_estimators': n_estimators,
                                 'max_depth': max_depth,
                                 'min_samples_split': min_samples_split,
                                 'min_samples_leaf': min_samples_leaf})
        candidate.fit(X_train, y_train)
        return test_r2(candidate)

    bayes_rfr = BayesianOptimization(f=rfr_optimizer, pbounds=search_space, random_state=42)
    bayes_rfr.maximize(init_points=10, n_iter=10)

    rfr_final = make_forest(bayes_rfr.max['params'])
    rfr_final.fit(X_train, y_train)

    return rfr_final, test_r2(rfr_final)


# Gradient Boosting Regressor Optimization
def optimize_gbr(X_train, X_test, y_train, y_test):
    """Tune a GradientBoostingRegressor with Bayesian optimisation.

    Maximises R^2 on (X_test, y_test) over `n_estimators`, `learning_rate`,
    `max_depth`, `min_samples_split`, `min_samples_leaf` and `subsample`,
    refits with the best parameters on the training data and returns
    `(fitted model, R^2 on the test data)`.
    """
    search_space = {'n_estimators': (10, 100), 'learning_rate': (0.001, 1.0), 'max_depth': (1, 50),
                    'min_samples_split': (2, 20), 'min_samples_leaf': (1, 20), 'subsample': (0.1, 1.0)}

    def make_booster(hp):
        # Integer-valued settings are cast; the two rates stay as floats.
        return GradientBoostingRegressor(n_estimators=int(round(hp['n_estimators'])),
                                         learning_rate=hp['learning_rate'],
                                         max_depth=int(round(hp['max_depth'])),
                                         min_samples_split=int(hp['min_samples_split']),
                                         min_samples_leaf=int(hp['min_samples_leaf']),
                                         subsample=hp['subsample'])

    def test_r2(estimator):
        return r2_score(y_test, estimator.predict(X_test))

    def gbr_optimizer(n_estimators, learning_rate, max_depth, min_samples_split, min_samples_leaf, subsample):
        candidate = make_booster({'n_estimators': n_estimators,
                                  'learning_rate': learning_rate,
                                  'max_depth': max_depth,
                                  'min_samples_split': min_samples_split,
                                  'min_samples_leaf': min_samples_leaf,
                                  'subsample': subsample})
        candidate.fit(X_train, y_train)
        return test_r2(candidate)

    bayes_gbr = BayesianOptimization(f=gbr_optimizer, pbounds=search_space, random_state=42)
    bayes_gbr.maximize(init_points=10, n_iter=10)

    gbr_final = make_booster(bayes_gbr.max['params'])
    gbr_final.fit(X_train, y_train)

    return gbr_final, test_r2(gbr_final)


# Decision Tree Classifier Optimization
def optimize_dtc(X_train, X_test, y_train, y_test):
    """Tune a DecisionTreeClassifier with Bayesian optimisation.

    Maximises accuracy on (X_test, y_test) over `max_depth`,
    `min_samples_split` and `min_samples_leaf`, refits with the best
    parameters on the training data and returns
    `(fitted model, accuracy on the test data)`.
    """
    search_space = {'max_depth': (1, 50), 'min_samples_split': (2, 20),
                    'min_samples_leaf': (1, 20)}

    def make_tree(hp):
        # The optimiser proposes floats; the tree needs integers.
        return DecisionTreeClassifier(max_depth=int(round(hp['max_depth'])),
                                      min_samples_split=int(hp['min_samples_split']),
                                      min_samples_leaf=int(hp['min_samples_leaf']))

    def test_accuracy(estimator):
        return accuracy_score(y_test, estimator.predict(X_test))

    def dtc_optimizer(max_depth, min_samples_split, min_samples_leaf):
        candidate = make_tree({'max_depth': max_depth,
                               'min_samples_split': min_samples_split,
                               'min_samples_leaf': min_samples_leaf})
        candidate.fit(X_train, y_train)
        return test_accuracy(candidate)

    bayes_dtc = BayesianOptimization(f=dtc_optimizer, pbounds=search_space, random_state=42)
    bayes_dtc.maximize(init_points=10, n_iter=10)

    dtc_final = make_tree(bayes_dtc.max['params'])
    dtc_final.fit(X_train, y_train)

    return dtc_final, test_accuracy(dtc_final)


# Random Forest Classifier Optimization
def optimize_rfc(X_train, X_test, y_train, y_test):
    """Tune a RandomForestClassifier with Bayesian optimisation.

    Maximises accuracy on (X_test, y_test) over `n_estimators`, `max_depth`,
    `min_samples_split` and `min_samples_leaf`, refits with the best
    parameters on the training data and returns
    `(fitted model, accuracy on the test data)`.
    """
    search_space = {'n_estimators': (10, 100), 'max_depth': (1, 50), 'min_samples_split': (2, 20),
                    'min_samples_leaf': (1, 20)}

    def make_forest(hp):
        # The optimiser proposes floats; the forest needs integers.
        return RandomForestClassifier(n_estimators=int(round(hp['n_estimators'])),
                                      max_depth=int(round(hp['max_depth'])),
                                      min_samples_split=int(hp['min_samples_split']),
                                      min_samples_leaf=int(hp['min_samples_leaf']))

    def test_accuracy(estimator):
        return accuracy_score(y_test, estimator.predict(X_test))

    def rfc_optimizer(n_estimators, max_depth, min_samples_split, min_samples_leaf):
        candidate = make_forest({'n_estimators': n_estimators,
                                 'max_depth': max_depth,
                                 'min_samples_split': min_samples_split,
                                 'min_samples_leaf': min_samples_leaf})
        candidate.fit(X_train, y_train)
        return test_accuracy(candidate)

    bayes_rfc = BayesianOptimization(f=rfc_optimizer, pbounds=search_space, random_state=42)
    bayes_rfc.maximize(init_points=10, n_iter=10)

    rfc_final = make_forest(bayes_rfc.max['params'])
    rfc_final.fit(X_train, y_train)

    return rfc_final, test_accuracy(rfc_final)


# Gradient Boosting Classifier Optimization
def optimize_gbc(X_train, X_test, y_train, y_test):
    """Tune a GradientBoostingClassifier with Bayesian optimisation.

    The search maximises accuracy on (X_test, y_test) over `n_estimators`,
    `learning_rate`, `max_depth`, `min_samples_split`, `min_samples_leaf`
    and `subsample`, then refits a classifier with the best parameters on
    the training data.

    Returns
    -------
    (fitted GradientBoostingClassifier, accuracy of that model on the test data)

    Fixes relative to the previous version:
      * the objective returns the bare accuracy; it used to return a
        `(score, params)` tuple, but the optimiser needs one number per
        evaluation;
      * search and refit share one builder, so `n_estimators` and
        `max_depth` are cast identically (`int(round(...))`) - previously
        the search truncated while the refit rounded, so the refitted model
        could differ from the one that was scored;
      * the unused local `accuracy` was removed.

    NOTE(review): hyperparameters are selected on the same test split that
    produces the reported score, so that score is optimistic.
    """
    def build_booster(n_estimators, learning_rate, max_depth, min_samples_split,
                      min_samples_leaf, subsample):
        # Integer-valued settings are cast; the two rates stay as floats.
        return GradientBoostingClassifier(n_estimators=int(round(n_estimators)),
                                          learning_rate=learning_rate,
                                          max_depth=int(round(max_depth)),
                                          min_samples_split=int(min_samples_split),
                                          min_samples_leaf=int(min_samples_leaf),
                                          subsample=subsample)

    def gbc_optimizer(n_estimators, learning_rate, max_depth, min_samples_split, min_samples_leaf, subsample):
        gbc = build_booster(n_estimators, learning_rate, max_depth,
                            min_samples_split, min_samples_leaf, subsample)
        gbc.fit(X_train, y_train)
        return accuracy_score(y_test, gbc.predict(X_test))

    gbc_bounds = {'n_estimators': (10, 100), 'learning_rate': (0.001, 1.0), 'max_depth': (1, 50),
                  'min_samples_split': (2, 20), 'min_samples_leaf': (1, 20), 'subsample': (0.1, 1.0)}

    bayes_gbc = BayesianOptimization(f=gbc_optimizer, pbounds=gbc_bounds, random_state=42)
    bayes_gbc.maximize(init_points=10, n_iter=10)
    best_params = bayes_gbc.max['params']

    gbc_final = build_booster(n_estimators=best_params['n_estimators'],
                              learning_rate=best_params['learning_rate'],
                              max_depth=best_params['max_depth'],
                              min_samples_split=best_params['min_samples_split'],
                              min_samples_leaf=best_params['min_samples_leaf'],
                              subsample=best_params['subsample'])
    gbc_final.fit(X_train, y_train)

    performance = accuracy_score(y_test, gbc_final.predict(X_test))

    return gbc_final, performance

In [None]:
#@title Model Optimization and Training
def optimize_model(algorithm, model, X_train, X_test, y_train, y_test):
    """Run the hyperparameter optimiser that matches `model`.

    Parameters
    ----------
    algorithm : str
        'regression' / 'r' or 'classification' / 'c', case-insensitive.
    model : estimator instance
        Its class decides which `optimize_*` function is called.
    X_train, X_test, y_train, y_test
        Forwarded unchanged to the optimiser.

    Returns
    -------
    (optimized_model, score) from the matching optimiser, or `(None, None)`
    - after printing 'No model selected !!' - when the problem type is
    unknown or `model` is not one of the supported estimators.

    The previous version left `score` (and in some paths `optimized_model`)
    unassigned in those fallback cases and failed with UnboundLocalError.
    """
    kind = str(algorithm).strip().lower()

    # The (estimator class, optimiser) pairs are only built for a recognised
    # problem type and are checked in the original if/elif order.
    if kind in ('regression', 'r'):
        optimizers = [
            (DecisionTreeRegressor, optimize_dtr),
            (RandomForestRegressor, optimize_rfr),
            (GradientBoostingRegressor, optimize_gbr),
        ]
    elif kind in ('classification', 'c'):
        optimizers = [
            (DecisionTreeClassifier, optimize_dtc),
            (RandomForestClassifier, optimize_rfc),
            (GradientBoostingClassifier, optimize_gbc),
        ]
    else:
        optimizers = []

    for estimator_cls, optimizer in optimizers:
        if isinstance(model, estimator_cls):
            optimized_model, score = optimizer(X_train, X_test, y_train, y_test)
            return optimized_model, score

    print('No model selected !!')
    return None, None

In [None]:
# Load the raw dataset; the path is relative to the notebook's working directory.
dataset = pd.read_csv('powerplant_energy_data.csv')

# Clean, impute and encode the raw frame (see `preprocess_data`).
data = preprocess_data(dataset)

# List the processed columns with their positions so the user can pick the
# target by index. The indices refer to `data`, not to the raw `dataset`.
print("Column Names with Indexes:")
for idx, col_name in enumerate(data.columns):
    print(f"Index {idx}: {col_name}")
# Interactive: blocks until a value is entered; a non-integer answer raises ValueError.
target_col_idx = int(input("Enter the index of the target variable column: "))

In [None]:
# Separate features (every column except the chosen one) from the target column.
X = data.drop(data.columns[target_col_idx], axis=1)
y = data.iloc[:, target_col_idx]
# Fixed-seed 80/20 hold-out split, used below by `optimize_model`.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# 1. Detect whether this is a regression or a classification problem.
algorithm = algorithm_type(X, y)
print(f'{algorithm} Analysis : ')

# 2. Build the list of candidate (name, estimator) pairs for that problem type.
models = model_analysis(algorithm)
print(models)

# 3. Select the best candidate; this cross-validates on the full X, y.
best_model = model_selection(models, X, y, algorithm)
print(f'Model selected : {best_model} !!')

# 4. Tune the selected estimator type and report its score on the test split.
final_model, performace = optimize_model(algorithm, best_model, X_train, X_test, y_train, y_test)
# NOTE(review): this prints the untuned `best_model`; `final_model` is the tuned,
# fitted estimator the score below belongs to - confirm which one should be shown.
print(f"Model : {best_model}")
# NOTE(review): `performace` is presumably a misspelling of `performance`; left
# unchanged because it is a notebook-global name other cells may read.
print(f"Performance : {performace}")